File: //opt/bcm-agent/log/bcm-si.instance-rr9enuui.root.log.INFO.20260319-102916.543705
Log file created at: 2026/03/19 10:29:16
Running on machine: instance-rr9enuui
Binary: Built with gc go1.23.8 for linux/amd64
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
I0319 10:29:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:29:16.458167 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:29:16.458186 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:29:16.472517 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:29:18.691090 543705 disk_info.go:125] begin check local disk info of client
I0319 10:29:18.693461 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:29:18.693467 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be00 0xc00007be40]
E0319 10:29:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:29:23.409775 543705 memory.go:184] no items to output this cycle
I0319 10:29:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 10:29:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:29:33.409781 543705 memory.go:184] no items to output this cycle
I0319 10:29:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 10:29:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:29:43.409808 543705 memory.go:191] Add success.
I0319 10:29:43.409821 543705 cpu.go:282] Add success.
I0319 10:29:43.419924 543705 net.go:648] Add success.
I0319 10:29:43.422685 543705 net.go:770] primary dev: ETH0
I0319 10:29:43.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:29:43.422710 543705 net.go:698] Add success.
I0319 10:29:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:29:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:29:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:29:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:29:53.409798 543705 memory.go:184] no items to output this cycle
I0319 10:29:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 10:30:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:03.409775 543705 memory.go:184] no items to output this cycle
I0319 10:30:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 10:30:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:13.409792 543705 memory.go:191] Add success.
I0319 10:30:13.409798 543705 cpu.go:282] Add success.
W0319 10:30:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:30:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:30:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:30:13.420216 543705 net.go:648] Add success.
I0319 10:30:13.423031 543705 net.go:770] primary dev: ETH0
I0319 10:30:13.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:30:13.423057 543705 net.go:698] Add success.
I0319 10:30:13.652533 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dc5ca1b0-1e5d-4749-aaed-f8c5060d09a6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:30:13.652565 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:30:14.454677 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:30:14.454843 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:30:14.454921 543705 disk_worker.go:708] disk space is not compliant
W0319 10:30:14.454924 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:30:14.456356 543705 disk_worker.go:494] system disk:vda1
I0319 10:30:14.456386 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:30:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:30:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:30:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:30:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:30:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:30:18.693671 543705 disk_info.go:125] begin check local disk info of client
I0319 10:30:18.696009 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:30:18.696016 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e6d80 0xc0004e6dc0]
E0319 10:30:23.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:23.409811 543705 memory.go:184] no items to output this cycle
I0319 10:30:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 10:30:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:33.409780 543705 memory.go:184] no items to output this cycle
I0319 10:30:33.409785 543705 cpu.go:275] no items to output this cycle
I0319 10:30:37.683108 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:30:37.683114 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:30:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:43.410632 543705 memory.go:191] Add success.
I0319 10:30:43.409802 543705 cpu.go:282] Add success.
I0319 10:30:43.420387 543705 net.go:648] Add success.
I0319 10:30:43.422907 543705 net.go:770] primary dev: ETH0
I0319 10:30:43.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:30:43.422931 543705 net.go:698] Add success.
I0319 10:30:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:30:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:30:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:30:53.409794 543705 memory.go:184] no items to output this cycle
I0319 10:30:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 10:31:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:03.409783 543705 memory.go:184] no items to output this cycle
I0319 10:31:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 10:31:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:13.409797 543705 memory.go:191] Add success.
I0319 10:31:13.409800 543705 cpu.go:282] Add success.
W0319 10:31:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:31:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:31:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:31:13.420078 543705 net.go:648] Add success.
I0319 10:31:13.422871 543705 net.go:770] primary dev: ETH0
I0319 10:31:13.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:31:13.422900 543705 net.go:698] Add success.
I0319 10:31:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:31:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:31:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 10:31:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:31:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 10:31:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:31:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:31:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:31:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:31:16.472439 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:31:18.697125 543705 disk_info.go:125] begin check local disk info of client
I0319 10:31:18.699537 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:31:18.699543 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002724c0 0xc000272500]
E0319 10:31:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:23.409791 543705 memory.go:184] no items to output this cycle
I0319 10:31:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 10:31:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:33.409803 543705 memory.go:184] no items to output this cycle
I0319 10:31:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 10:31:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:43.409819 543705 memory.go:191] Add success.
I0319 10:31:43.409860 543705 cpu.go:282] Add success.
I0319 10:31:43.420087 543705 net.go:648] Add success.
I0319 10:31:43.422800 543705 net.go:770] primary dev: ETH0
I0319 10:31:43.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:31:43.422829 543705 net.go:698] Add success.
I0319 10:31:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:31:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:31:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:31:53.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:31:53.409905 543705 memory.go:184] no items to output this cycle
I0319 10:31:53.410012 543705 cpu.go:275] no items to output this cycle
E0319 10:32:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:03.409811 543705 memory.go:184] no items to output this cycle
I0319 10:32:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 10:32:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:13.409785 543705 memory.go:191] Add success.
W0319 10:32:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 10:32:13.409815 543705 cpu.go:282] Add success.
W0319 10:32:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:32:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:32:13.420171 543705 net.go:648] Add success.
I0319 10:32:13.423684 543705 net.go:770] primary dev: ETH0
I0319 10:32:13.423697 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:32:13.423709 543705 net.go:698] Add success.
W0319 10:32:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:32:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 10:32:14.455201 543705 disk_worker.go:728] disk inode is not compliant
E0319 10:32:14.455900 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:32:14.455909 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:32:14.455915 543705 custom_config.go:64] query custom config with name: gpu
I0319 10:32:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 10:32:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:32:15.456854 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:32:15.456863 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:32:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:32:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:32:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:32:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:32:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:32:18.700149 543705 disk_info.go:125] begin check local disk info of client
I0319 10:32:18.702485 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:32:18.702492 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035a6c0 0xc00035a700]
E0319 10:32:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:23.409795 543705 memory.go:184] no items to output this cycle
I0319 10:32:23.409856 543705 cpu.go:275] no items to output this cycle
E0319 10:32:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:33.409791 543705 memory.go:184] no items to output this cycle
I0319 10:32:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 10:32:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:43.409795 543705 memory.go:191] Add success.
I0319 10:32:43.409828 543705 cpu.go:282] Add success.
I0319 10:32:43.419985 543705 net.go:648] Add success.
I0319 10:32:43.422777 543705 net.go:770] primary dev: ETH0
I0319 10:32:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:32:43.422804 543705 net.go:698] Add success.
I0319 10:32:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:32:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:32:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:32:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:32:53.409804 543705 memory.go:184] no items to output this cycle
I0319 10:32:53.409822 543705 cpu.go:275] no items to output this cycle
E0319 10:33:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:03.409792 543705 memory.go:184] no items to output this cycle
I0319 10:33:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 10:33:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:13.409793 543705 memory.go:191] Add success.
W0319 10:33:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:33:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:33:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:33:13.409850 543705 cpu.go:282] Add success.
I0319 10:33:13.420079 543705 net.go:648] Add success.
I0319 10:33:13.422840 543705 net.go:770] primary dev: ETH0
I0319 10:33:13.422853 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:33:13.422865 543705 net.go:698] Add success.
I0319 10:33:13.469183 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"df0e4c01-af13-4f76-8450-ef59bc717ada","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:33:13.469216 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:33:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:33:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:33:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 10:33:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:33:14.456614 543705 disk_worker.go:494] system disk:vda1
I0319 10:33:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:33:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:33:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:33:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:33:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:33:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:33:18.702573 543705 disk_info.go:125] begin check local disk info of client
I0319 10:33:18.705024 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:33:18.705031 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390640 0xc000390680]
E0319 10:33:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:23.409776 543705 memory.go:184] no items to output this cycle
I0319 10:33:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 10:33:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:33.409821 543705 memory.go:184] no items to output this cycle
I0319 10:33:33.409836 543705 cpu.go:275] no items to output this cycle
I0319 10:33:37.683261 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:33:37.683268 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:33:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:43.410669 543705 memory.go:191] Add success.
I0319 10:33:43.409811 543705 cpu.go:282] Add success.
I0319 10:33:43.420409 543705 net.go:648] Add success.
I0319 10:33:43.423222 543705 net.go:770] primary dev: ETH0
I0319 10:33:43.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:33:43.423248 543705 net.go:698] Add success.
I0319 10:33:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:33:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:33:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:33:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:33:53.409784 543705 memory.go:184] no items to output this cycle
I0319 10:33:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 10:34:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:03.409802 543705 memory.go:184] no items to output this cycle
I0319 10:34:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 10:34:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:13.409812 543705 memory.go:191] Add success.
I0319 10:34:13.409832 543705 cpu.go:282] Add success.
W0319 10:34:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:34:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:34:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:34:13.420120 543705 net.go:648] Add success.
I0319 10:34:13.423199 543705 net.go:770] primary dev: ETH0
I0319 10:34:13.423212 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:34:13.423225 543705 net.go:698] Add success.
I0319 10:34:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:34:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:34:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 10:34:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:34:14.456561 543705 disk_worker.go:494] system disk:vda1
I0319 10:34:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:34:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:34:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:34:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:34:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:34:18.705673 543705 disk_info.go:125] begin check local disk info of client
I0319 10:34:18.708029 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:34:18.708034 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315c80 0xc000315cc0]
E0319 10:34:23.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:23.409814 543705 cpu.go:275] no items to output this cycle
I0319 10:34:23.409830 543705 memory.go:184] no items to output this cycle
E0319 10:34:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:33.409809 543705 memory.go:184] no items to output this cycle
I0319 10:34:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 10:34:43.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:43.409910 543705 memory.go:191] Add success.
I0319 10:34:43.409994 543705 cpu.go:282] Add success.
I0319 10:34:43.419708 543705 net.go:648] Add success.
I0319 10:34:43.422579 543705 net.go:770] primary dev: ETH0
I0319 10:34:43.422593 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:34:43.422605 543705 net.go:698] Add success.
I0319 10:34:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:34:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:34:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:34:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:34:53.409805 543705 memory.go:184] no items to output this cycle
I0319 10:34:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 10:35:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:03.409777 543705 memory.go:184] no items to output this cycle
I0319 10:35:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 10:35:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:13.409817 543705 memory.go:191] Add success.
I0319 10:35:13.409825 543705 cpu.go:282] Add success.
W0319 10:35:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:35:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:35:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:35:13.420073 543705 net.go:648] Add success.
I0319 10:35:13.423010 543705 net.go:770] primary dev: ETH0
I0319 10:35:13.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:35:13.423048 543705 net.go:698] Add success.
I0319 10:35:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:35:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:35:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0319 10:35:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:35:14.456631 543705 disk_worker.go:494] system disk:vda1
I0319 10:35:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:35:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:35:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:35:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:35:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:35:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:35:18.709170 543705 disk_info.go:125] begin check local disk info of client
I0319 10:35:18.711541 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:35:18.711547 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c6c0 0xc00035c700]
E0319 10:35:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:23.409786 543705 memory.go:184] no items to output this cycle
I0319 10:35:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 10:35:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:33.409775 543705 memory.go:184] no items to output this cycle
I0319 10:35:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 10:35:43.409825 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:43.409864 543705 memory.go:191] Add success.
I0319 10:35:43.409866 543705 cpu.go:282] Add success.
I0319 10:35:43.420022 543705 net.go:648] Add success.
I0319 10:35:43.422955 543705 net.go:770] primary dev: ETH0
I0319 10:35:43.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:35:43.422981 543705 net.go:698] Add success.
I0319 10:35:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:35:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:35:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:35:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:35:53.409793 543705 memory.go:184] no items to output this cycle
I0319 10:35:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 10:36:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:03.409792 543705 memory.go:184] no items to output this cycle
I0319 10:36:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 10:36:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:13.409789 543705 memory.go:191] Add success.
I0319 10:36:13.409790 543705 cpu.go:282] Add success.
W0319 10:36:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:36:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:36:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:36:13.420134 543705 net.go:648] Add success.
I0319 10:36:13.422802 543705 net.go:770] primary dev: ETH0
I0319 10:36:13.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:36:13.422826 543705 net.go:698] Add success.
I0319 10:36:13.469294 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"28bc020a-8051-41dd-8b3d-4a3232e69270","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:36:13.469338 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:36:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:36:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 10:36:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:36:14.456665 543705 disk_worker.go:494] system disk:vda1
I0319 10:36:14.456693 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:36:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:36:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:36:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:36:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:36:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:36:18.712255 543705 disk_info.go:125] begin check local disk info of client
I0319 10:36:18.714735 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:36:18.714741 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034de40 0xc00034de80]
E0319 10:36:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:23.409797 543705 memory.go:184] no items to output this cycle
I0319 10:36:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 10:36:33.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:33.409917 543705 memory.go:184] no items to output this cycle
I0319 10:36:33.410010 543705 cpu.go:275] no items to output this cycle
I0319 10:36:37.683403 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:36:37.683410 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:36:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:43.410815 543705 memory.go:191] Add success.
I0319 10:36:43.409809 543705 cpu.go:282] Add success.
I0319 10:36:43.420542 543705 net.go:648] Add success.
I0319 10:36:43.423388 543705 net.go:770] primary dev: ETH0
I0319 10:36:43.423401 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:36:43.423413 543705 net.go:698] Add success.
I0319 10:36:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:36:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:36:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:36:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:36:53.409816 543705 memory.go:184] no items to output this cycle
I0319 10:36:53.409824 543705 cpu.go:275] no items to output this cycle
E0319 10:37:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:03.409780 543705 cpu.go:275] no items to output this cycle
I0319 10:37:03.409783 543705 memory.go:184] no items to output this cycle
E0319 10:37:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:13.409799 543705 memory.go:191] Add success.
I0319 10:37:13.409802 543705 cpu.go:282] Add success.
W0319 10:37:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:37:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:37:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:37:13.420188 543705 net.go:648] Add success.
I0319 10:37:13.422934 543705 net.go:770] primary dev: ETH0
I0319 10:37:13.422948 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:37:13.422959 543705 net.go:698] Add success.
I0319 10:37:13.453516 543705 event_worker.go:152] Polling the log file for events...
W0319 10:37:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:37:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 10:37:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:37:14.456809 543705 disk_worker.go:494] system disk:vda1
I0319 10:37:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:37:14.457119 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:37:14.457127 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:37:14.457131 543705 custom_config.go:64] query custom config with name: gpu
E0319 10:37:15.456803 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:37:15.456811 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:37:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:37:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:37:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:37:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:37:16.472346 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:37:18.716217 543705 disk_info.go:125] begin check local disk info of client
I0319 10:37:18.718568 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:37:18.718574 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057e0c0 0xc00057e100]
E0319 10:37:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:23.409801 543705 memory.go:184] no items to output this cycle
I0319 10:37:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 10:37:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:33.409886 543705 memory.go:184] no items to output this cycle
I0319 10:37:33.409927 543705 cpu.go:275] no items to output this cycle
E0319 10:37:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:43.409805 543705 memory.go:191] Add success.
I0319 10:37:43.409809 543705 cpu.go:282] Add success.
I0319 10:37:43.419962 543705 net.go:648] Add success.
I0319 10:37:43.423190 543705 net.go:770] primary dev: ETH0
I0319 10:37:43.423205 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:37:43.423217 543705 net.go:698] Add success.
I0319 10:37:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:37:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:37:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:37:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:37:53.409819 543705 memory.go:184] no items to output this cycle
I0319 10:37:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 10:38:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:03.409773 543705 memory.go:184] no items to output this cycle
I0319 10:38:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 10:38:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:13.409797 543705 memory.go:191] Add success.
I0319 10:38:13.409800 543705 cpu.go:282] Add success.
W0319 10:38:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:38:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:38:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:38:13.420081 543705 net.go:648] Add success.
I0319 10:38:13.422783 543705 net.go:770] primary dev: ETH0
I0319 10:38:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:38:13.422809 543705 net.go:698] Add success.
I0319 10:38:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:38:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:38:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 10:38:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:38:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 10:38:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:38:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:38:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:38:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:38:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:38:16.472422 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:38:18.718652 543705 disk_info.go:125] begin check local disk info of client
I0319 10:38:18.721059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:38:18.721064 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027a640 0xc00027a680]
E0319 10:38:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:23.409775 543705 memory.go:184] no items to output this cycle
I0319 10:38:23.409777 543705 cpu.go:275] no items to output this cycle
E0319 10:38:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:33.409771 543705 memory.go:184] no items to output this cycle
I0319 10:38:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 10:38:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:43.409827 543705 memory.go:191] Add success.
I0319 10:38:43.409836 543705 cpu.go:282] Add success.
I0319 10:38:43.419989 543705 net.go:648] Add success.
I0319 10:38:43.423105 543705 net.go:770] primary dev: ETH0
I0319 10:38:43.423118 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:38:43.423131 543705 net.go:698] Add success.
I0319 10:38:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:38:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:38:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:38:53.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:38:53.409817 543705 memory.go:184] no items to output this cycle
I0319 10:38:53.409819 543705 cpu.go:275] no items to output this cycle
E0319 10:39:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:03.409766 543705 memory.go:184] no items to output this cycle
I0319 10:39:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 10:39:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:13.409794 543705 memory.go:191] Add success.
I0319 10:39:13.409798 543705 cpu.go:282] Add success.
W0319 10:39:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:39:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:39:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:39:13.420113 543705 net.go:648] Add success.
I0319 10:39:13.422845 543705 net.go:770] primary dev: ETH0
I0319 10:39:13.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:39:13.422871 543705 net.go:698] Add success.
I0319 10:39:13.468832 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8485047f-0d49-4956-8604-3b3c5e86858a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:39:13.468871 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:39:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:39:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:39:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 10:39:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:39:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 10:39:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:39:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:39:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:39:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:39:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:39:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:39:18.721671 543705 disk_info.go:125] begin check local disk info of client
I0319 10:39:18.724031 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:39:18.724037 543705 disk_info.go:196] parse disk info done, disk is : [0xc000295400 0xc000295440]
E0319 10:39:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:23.409799 543705 memory.go:184] no items to output this cycle
I0319 10:39:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 10:39:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:33.409776 543705 memory.go:184] no items to output this cycle
I0319 10:39:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 10:39:37.684119 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:39:37.684125 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:39:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:43.410606 543705 memory.go:191] Add success.
I0319 10:39:43.409804 543705 cpu.go:282] Add success.
I0319 10:39:43.420471 543705 net.go:648] Add success.
I0319 10:39:43.422928 543705 net.go:770] primary dev: ETH0
I0319 10:39:43.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:39:43.422954 543705 net.go:698] Add success.
I0319 10:39:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:39:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:39:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:39:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:39:53.409816 543705 memory.go:184] no items to output this cycle
I0319 10:39:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 10:40:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:03.409763 543705 memory.go:184] no items to output this cycle
I0319 10:40:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 10:40:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:13.409798 543705 memory.go:191] Add success.
I0319 10:40:13.409801 543705 cpu.go:282] Add success.
W0319 10:40:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:40:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:40:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:40:13.420044 543705 net.go:648] Add success.
I0319 10:40:13.422890 543705 net.go:770] primary dev: ETH0
I0319 10:40:13.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:40:13.422914 543705 net.go:698] Add success.
I0319 10:40:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:40:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:40:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 10:40:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:40:14.456574 543705 disk_worker.go:494] system disk:vda1
I0319 10:40:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:40:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:40:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:40:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:40:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:40:18.725264 543705 disk_info.go:125] begin check local disk info of client
I0319 10:40:18.727627 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:40:18.727633 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027abc0 0xc00027ac00]
E0319 10:40:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:23.409779 543705 cpu.go:275] no items to output this cycle
I0319 10:40:23.409781 543705 memory.go:184] no items to output this cycle
E0319 10:40:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:33.409775 543705 memory.go:184] no items to output this cycle
I0319 10:40:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 10:40:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:43.409798 543705 memory.go:191] Add success.
I0319 10:40:43.409799 543705 cpu.go:282] Add success.
I0319 10:40:43.419737 543705 net.go:648] Add success.
I0319 10:40:43.422496 543705 net.go:770] primary dev: ETH0
I0319 10:40:43.422508 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:40:43.422520 543705 net.go:698] Add success.
I0319 10:40:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:40:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:40:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:40:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:40:53.409819 543705 memory.go:184] no items to output this cycle
I0319 10:40:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 10:41:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:03.409800 543705 memory.go:184] no items to output this cycle
I0319 10:41:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 10:41:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:13.409812 543705 memory.go:191] Add success.
I0319 10:41:13.409822 543705 cpu.go:282] Add success.
W0319 10:41:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:41:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:41:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:41:13.420170 543705 net.go:648] Add success.
I0319 10:41:13.422740 543705 net.go:770] primary dev: ETH0
I0319 10:41:13.422754 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:41:13.422766 543705 net.go:698] Add success.
I0319 10:41:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:41:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:41:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 10:41:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:41:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 10:41:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:41:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:41:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:41:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:41:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:41:18.728292 543705 disk_info.go:125] begin check local disk info of client
I0319 10:41:18.730703 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:41:18.730708 543705 disk_info.go:196] parse disk info done, disk is : [0xc000539640 0xc000539680]
E0319 10:41:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:23.409796 543705 memory.go:184] no items to output this cycle
I0319 10:41:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 10:41:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:33.409791 543705 memory.go:184] no items to output this cycle
I0319 10:41:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 10:41:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:43.409897 543705 cpu.go:282] Add success.
I0319 10:41:43.409904 543705 memory.go:191] Add success.
I0319 10:41:43.419713 543705 net.go:648] Add success.
I0319 10:41:43.422532 543705 net.go:770] primary dev: ETH0
I0319 10:41:43.422544 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:41:43.422555 543705 net.go:698] Add success.
I0319 10:41:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:41:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:41:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:41:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:41:53.409785 543705 memory.go:184] no items to output this cycle
I0319 10:41:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 10:42:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:03.409765 543705 memory.go:184] no items to output this cycle
I0319 10:42:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 10:42:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:13.409796 543705 memory.go:191] Add success.
I0319 10:42:13.409796 543705 cpu.go:282] Add success.
W0319 10:42:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:42:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:42:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:42:13.420123 543705 net.go:648] Add success.
I0319 10:42:13.423008 543705 net.go:770] primary dev: ETH0
I0319 10:42:13.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:42:13.423034 543705 net.go:698] Add success.
I0319 10:42:13.469652 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7af4c89c-a15f-48ff-89c9-1082ca0719ab","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:42:13.469690 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 10:42:14.455229 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:42:14.455243 543705 disk_worker.go:708] disk space is not compliant
W0319 10:42:14.455248 543705 disk_worker.go:728] disk inode is not compliant
E0319 10:42:14.455964 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:42:14.455973 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:42:14.455979 543705 custom_config.go:64] query custom config with name: gpu
I0319 10:42:14.456863 543705 disk_worker.go:494] system disk:vda1
I0319 10:42:14.456891 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:42:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:42:15.456853 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:42:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:42:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:42:16.457991 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:42:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:42:16.472331 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:42:18.731357 543705 disk_info.go:125] begin check local disk info of client
I0319 10:42:18.733664 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:42:18.733670 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377bc0 0xc000377c00]
E0319 10:42:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:23.409792 543705 memory.go:184] no items to output this cycle
I0319 10:42:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 10:42:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:33.409773 543705 memory.go:184] no items to output this cycle
I0319 10:42:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 10:42:37.685130 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:42:37.685136 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:42:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:43.410936 543705 memory.go:191] Add success.
I0319 10:42:43.409825 543705 cpu.go:282] Add success.
I0319 10:42:43.420663 543705 net.go:648] Add success.
I0319 10:42:43.423657 543705 net.go:770] primary dev: ETH0
I0319 10:42:43.423670 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:42:43.423683 543705 net.go:698] Add success.
I0319 10:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:42:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:42:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:42:53.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:42:53.409825 543705 memory.go:184] no items to output this cycle
I0319 10:42:53.409836 543705 cpu.go:275] no items to output this cycle
E0319 10:43:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:03.409805 543705 memory.go:184] no items to output this cycle
I0319 10:43:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 10:43:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:13.409827 543705 memory.go:191] Add success.
I0319 10:43:13.409836 543705 cpu.go:282] Add success.
W0319 10:43:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:43:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:43:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:43:13.420145 543705 net.go:648] Add success.
I0319 10:43:13.422749 543705 net.go:770] primary dev: ETH0
I0319 10:43:13.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:43:13.422776 543705 net.go:698] Add success.
I0319 10:43:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:43:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:43:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 10:43:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:43:14.456622 543705 disk_worker.go:494] system disk:vda1
I0319 10:43:14.456657 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:43:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:43:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:43:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:43:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:43:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:43:18.733751 543705 disk_info.go:125] begin check local disk info of client
I0319 10:43:18.736130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:43:18.736136 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377340 0xc000377380]
E0319 10:43:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:23.409806 543705 memory.go:184] no items to output this cycle
I0319 10:43:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 10:43:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:33.409792 543705 memory.go:184] no items to output this cycle
I0319 10:43:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 10:43:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:43.409802 543705 cpu.go:282] Add success.
I0319 10:43:43.409809 543705 memory.go:191] Add success.
I0319 10:43:43.419956 543705 net.go:648] Add success.
I0319 10:43:43.422593 543705 net.go:770] primary dev: ETH0
I0319 10:43:43.422607 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:43:43.422618 543705 net.go:698] Add success.
I0319 10:43:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:43:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:43:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:43:53.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:43:53.409825 543705 memory.go:184] no items to output this cycle
I0319 10:43:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 10:44:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:03.409782 543705 memory.go:184] no items to output this cycle
I0319 10:44:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 10:44:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:13.409787 543705 memory.go:191] Add success.
I0319 10:44:13.409805 543705 cpu.go:282] Add success.
W0319 10:44:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:44:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:44:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:44:13.420323 543705 net.go:648] Add success.
I0319 10:44:13.423236 543705 net.go:770] primary dev: ETH0
I0319 10:44:13.423249 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:44:13.423261 543705 net.go:698] Add success.
I0319 10:44:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:44:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:44:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 10:44:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:44:14.456522 543705 disk_worker.go:494] system disk:vda1
I0319 10:44:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:44:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:44:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:44:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:44:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:44:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:44:18.736216 543705 disk_info.go:125] begin check local disk info of client
I0319 10:44:18.738672 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:44:18.738678 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aafc0 0xc0001ab000]
E0319 10:44:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:23.409763 543705 memory.go:184] no items to output this cycle
I0319 10:44:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 10:44:33.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:33.409889 543705 memory.go:184] no items to output this cycle
I0319 10:44:33.409969 543705 cpu.go:275] no items to output this cycle
E0319 10:44:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:43.409798 543705 memory.go:191] Add success.
I0319 10:44:43.409801 543705 cpu.go:282] Add success.
I0319 10:44:43.419925 543705 net.go:648] Add success.
I0319 10:44:43.422782 543705 net.go:770] primary dev: ETH0
I0319 10:44:43.422797 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:44:43.422812 543705 net.go:698] Add success.
I0319 10:44:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:44:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:44:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:44:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:44:53.409809 543705 cpu.go:275] no items to output this cycle
I0319 10:44:53.409825 543705 memory.go:184] no items to output this cycle
E0319 10:45:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:03.409802 543705 memory.go:184] no items to output this cycle
I0319 10:45:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 10:45:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:13.409816 543705 memory.go:191] Add success.
I0319 10:45:13.409824 543705 cpu.go:282] Add success.
W0319 10:45:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:45:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:45:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:45:13.420122 543705 net.go:648] Add success.
I0319 10:45:13.423005 543705 net.go:770] primary dev: ETH0
I0319 10:45:13.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:45:13.423034 543705 net.go:698] Add success.
I0319 10:45:13.464265 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"44cc1f5c-b8c6-4dd0-8800-b0c81fb84af8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:45:13.464300 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:45:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:45:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:45:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 10:45:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:45:14.456712 543705 disk_worker.go:494] system disk:vda1
I0319 10:45:14.456745 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:45:15.455614 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:45:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:45:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:45:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:45:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:45:18.738758 543705 disk_info.go:125] begin check local disk info of client
I0319 10:45:18.741156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:45:18.741163 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396dc0 0xc000396e00]
E0319 10:45:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:23.409763 543705 memory.go:184] no items to output this cycle
I0319 10:45:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 10:45:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:33.409815 543705 memory.go:184] no items to output this cycle
I0319 10:45:33.409826 543705 cpu.go:275] no items to output this cycle
I0319 10:45:37.685735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:45:37.685743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:45:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:43.410625 543705 memory.go:191] Add success.
I0319 10:45:43.409834 543705 cpu.go:282] Add success.
I0319 10:45:43.420174 543705 net.go:770] primary dev: ETH0
I0319 10:45:43.420187 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:45:43.420201 543705 net.go:698] Add success.
I0319 10:45:43.420433 543705 net.go:648] Add success.
I0319 10:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:45:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:45:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:45:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:45:53.409790 543705 memory.go:184] no items to output this cycle
I0319 10:45:53.409855 543705 cpu.go:275] no items to output this cycle
E0319 10:46:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:03.409790 543705 memory.go:184] no items to output this cycle
I0319 10:46:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 10:46:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:13.409817 543705 memory.go:191] Add success.
I0319 10:46:13.409828 543705 cpu.go:282] Add success.
W0319 10:46:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:46:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:46:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:46:13.420029 543705 net.go:648] Add success.
I0319 10:46:13.422774 543705 net.go:770] primary dev: ETH0
I0319 10:46:13.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:46:13.422800 543705 net.go:698] Add success.
I0319 10:46:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:46:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:46:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 10:46:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:46:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 10:46:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:46:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:46:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:46:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:46:18.741710 543705 disk_info.go:125] begin check local disk info of client
I0319 10:46:18.744067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:46:18.744072 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002736c0 0xc000273700]
E0319 10:46:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:23.409802 543705 memory.go:184] no items to output this cycle
I0319 10:46:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 10:46:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:33.409783 543705 memory.go:184] no items to output this cycle
I0319 10:46:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 10:46:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:43.409793 543705 memory.go:191] Add success.
I0319 10:46:43.409825 543705 cpu.go:282] Add success.
I0319 10:46:43.419811 543705 net.go:770] primary dev: ETH0
I0319 10:46:43.419823 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:46:43.419836 543705 net.go:698] Add success.
I0319 10:46:43.420063 543705 net.go:648] Add success.
I0319 10:46:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:46:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:46:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:46:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 10:46:53.409820 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:46:53.409840 543705 memory.go:184] no items to output this cycle
E0319 10:47:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:03.409780 543705 memory.go:184] no items to output this cycle
I0319 10:47:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 10:47:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:13.409799 543705 memory.go:191] Add success.
I0319 10:47:13.409799 543705 cpu.go:282] Add success.
W0319 10:47:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:47:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:47:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:47:13.420065 543705 net.go:648] Add success.
I0319 10:47:13.422711 543705 net.go:770] primary dev: ETH0
I0319 10:47:13.422725 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:47:13.422736 543705 net.go:698] Add success.
I0319 10:47:13.453258 543705 event_worker.go:152] Polling the log file for events...
W0319 10:47:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:47:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 10:47:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0319 10:47:14.456915 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:47:14.456924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:47:14.456930 543705 custom_config.go:64] query custom config with name: gpu
I0319 10:47:14.456979 543705 disk_worker.go:494] system disk:vda1
I0319 10:47:14.457021 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:47:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:47:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:47:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:47:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:47:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:47:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:47:16.472325 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:47:18.745366 543705 disk_info.go:125] begin check local disk info of client
I0319 10:47:18.747744 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:47:18.747750 543705 disk_info.go:196] parse disk info done, disk is : [0xc000298d40 0xc000298d80]
E0319 10:47:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:23.409803 543705 memory.go:184] no items to output this cycle
I0319 10:47:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 10:47:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:33.409815 543705 memory.go:184] no items to output this cycle
I0319 10:47:33.409832 543705 cpu.go:275] no items to output this cycle
E0319 10:47:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:43.409882 543705 memory.go:191] Add success.
I0319 10:47:43.409926 543705 cpu.go:282] Add success.
I0319 10:47:43.419719 543705 net.go:648] Add success.
I0319 10:47:43.422824 543705 net.go:770] primary dev: ETH0
I0319 10:47:43.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:47:43.422849 543705 net.go:698] Add success.
I0319 10:47:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:47:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:47:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:47:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:47:53.409818 543705 cpu.go:275] no items to output this cycle
I0319 10:47:53.409827 543705 memory.go:184] no items to output this cycle
E0319 10:48:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:03.409791 543705 memory.go:184] no items to output this cycle
I0319 10:48:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 10:48:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:13.409795 543705 memory.go:191] Add success.
I0319 10:48:13.409796 543705 cpu.go:282] Add success.
W0319 10:48:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:48:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:48:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:48:13.419886 543705 net.go:770] primary dev: ETH0
I0319 10:48:13.419900 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:48:13.419911 543705 net.go:698] Add success.
I0319 10:48:13.420286 543705 net.go:648] Add success.
I0319 10:48:13.468776 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"76e0493f-e054-461e-ab0b-6d79222ab208","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:48:13.468807 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:48:14.454943 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:48:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:48:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0319 10:48:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:48:14.456546 543705 disk_worker.go:494] system disk:vda1
I0319 10:48:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:48:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:48:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:48:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:48:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:48:16.472355 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:48:18.748446 543705 disk_info.go:125] begin check local disk info of client
I0319 10:48:18.750841 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:48:18.750847 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abac0 0xc0001abb00]
E0319 10:48:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:23.409802 543705 memory.go:184] no items to output this cycle
I0319 10:48:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 10:48:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:33.409786 543705 memory.go:184] no items to output this cycle
I0319 10:48:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 10:48:37.685875 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:48:37.685882 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:48:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:43.409803 543705 cpu.go:282] Add success.
I0319 10:48:43.410796 543705 memory.go:191] Add success.
I0319 10:48:43.419509 543705 net.go:770] primary dev: ETH0
I0319 10:48:43.419524 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:48:43.419538 543705 net.go:698] Add success.
I0319 10:48:43.419887 543705 net.go:648] Add success.
I0319 10:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:48:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:48:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:48:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:48:53.409791 543705 memory.go:184] no items to output this cycle
I0319 10:48:53.409852 543705 cpu.go:275] no items to output this cycle
E0319 10:49:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:03.409789 543705 memory.go:184] no items to output this cycle
I0319 10:49:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 10:49:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:13.409796 543705 memory.go:191] Add success.
I0319 10:49:13.409798 543705 cpu.go:282] Add success.
W0319 10:49:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:49:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:49:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:49:13.420131 543705 net.go:648] Add success.
I0319 10:49:13.423190 543705 net.go:770] primary dev: ETH0
I0319 10:49:13.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:49:13.423215 543705 net.go:698] Add success.
I0319 10:49:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:49:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:49:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 10:49:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:49:14.456575 543705 disk_worker.go:494] system disk:vda1
I0319 10:49:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:49:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:49:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:49:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:49:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:49:16.472470 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:49:18.752396 543705 disk_info.go:125] begin check local disk info of client
I0319 10:49:18.754823 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:49:18.754829 543705 disk_info.go:196] parse disk info done, disk is : [0xc000298680 0xc0002986c0]
E0319 10:49:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:23.409770 543705 memory.go:184] no items to output this cycle
I0319 10:49:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 10:49:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:33.409816 543705 memory.go:184] no items to output this cycle
I0319 10:49:33.409828 543705 cpu.go:275] no items to output this cycle
E0319 10:49:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:43.409827 543705 memory.go:191] Add success.
I0319 10:49:43.409832 543705 cpu.go:282] Add success.
I0319 10:49:43.420024 543705 net.go:648] Add success.
I0319 10:49:43.423362 543705 net.go:770] primary dev: ETH0
I0319 10:49:43.423376 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:49:43.423390 543705 net.go:698] Add success.
I0319 10:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:49:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:49:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:49:53.410584 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:49:53.410630 543705 memory.go:184] no items to output this cycle
I0319 10:49:53.410714 543705 cpu.go:275] no items to output this cycle
E0319 10:50:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:03.409792 543705 memory.go:184] no items to output this cycle
I0319 10:50:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 10:50:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:13.409817 543705 memory.go:191] Add success.
I0319 10:50:13.409825 543705 cpu.go:282] Add success.
W0319 10:50:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:50:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:50:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:50:13.420110 543705 net.go:648] Add success.
I0319 10:50:13.422993 543705 net.go:770] primary dev: ETH0
I0319 10:50:13.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:50:13.423019 543705 net.go:698] Add success.
I0319 10:50:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:50:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:50:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 10:50:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:50:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 10:50:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:50:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:50:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:50:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:50:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:50:18.754912 543705 disk_info.go:125] begin check local disk info of client
I0319 10:50:18.757318 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:50:18.757324 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331b00 0xc000331b40]
E0319 10:50:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:23.409759 543705 memory.go:184] no items to output this cycle
I0319 10:50:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 10:50:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:33.409789 543705 memory.go:184] no items to output this cycle
I0319 10:50:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 10:50:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:43.409814 543705 memory.go:191] Add success.
I0319 10:50:43.409822 543705 cpu.go:282] Add success.
I0319 10:50:43.419965 543705 net.go:648] Add success.
I0319 10:50:43.423107 543705 net.go:770] primary dev: ETH0
I0319 10:50:43.423122 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:50:43.423134 543705 net.go:698] Add success.
I0319 10:50:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:50:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:50:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:50:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:50:53.409776 543705 memory.go:184] no items to output this cycle
I0319 10:50:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 10:51:03.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:03.409887 543705 memory.go:184] no items to output this cycle
I0319 10:51:03.409989 543705 cpu.go:275] no items to output this cycle
E0319 10:51:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:13.409804 543705 memory.go:191] Add success.
I0319 10:51:13.409808 543705 cpu.go:282] Add success.
W0319 10:51:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:51:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:51:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:51:13.420043 543705 net.go:648] Add success.
I0319 10:51:13.422860 543705 net.go:770] primary dev: ETH0
I0319 10:51:13.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:51:13.422885 543705 net.go:698] Add success.
I0319 10:51:13.473321 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"21946fda-14d4-45c7-b5d0-19e5f927c7fb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:51:13.473355 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:51:14.454984 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:51:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:51:14.455241 543705 disk_worker.go:708] disk space is not compliant
W0319 10:51:14.455244 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:51:14.456779 543705 disk_worker.go:494] system disk:vda1
I0319 10:51:14.456810 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:51:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:51:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:51:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:51:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:51:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:51:18.757674 543705 disk_info.go:125] begin check local disk info of client
I0319 10:51:18.760057 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:51:18.760063 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331a80 0xc000331ac0]
E0319 10:51:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:23.409772 543705 memory.go:184] no items to output this cycle
I0319 10:51:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 10:51:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:33.409800 543705 memory.go:184] no items to output this cycle
I0319 10:51:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 10:51:37.686021 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:51:37.686028 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:51:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:43.410593 543705 memory.go:191] Add success.
I0319 10:51:43.409812 543705 cpu.go:282] Add success.
I0319 10:51:43.420288 543705 net.go:648] Add success.
I0319 10:51:43.423001 543705 net.go:770] primary dev: ETH0
I0319 10:51:43.423014 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:51:43.423026 543705 net.go:698] Add success.
I0319 10:51:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:51:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:51:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:51:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:51:53.409789 543705 memory.go:184] no items to output this cycle
I0319 10:51:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 10:52:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:03.409813 543705 memory.go:184] no items to output this cycle
I0319 10:52:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 10:52:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:13.409800 543705 cpu.go:282] Add success.
I0319 10:52:13.409805 543705 memory.go:191] Add success.
W0319 10:52:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:52:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:52:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:52:13.420065 543705 net.go:648] Add success.
I0319 10:52:13.422884 543705 net.go:770] primary dev: ETH0
I0319 10:52:13.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:52:13.422909 543705 net.go:698] Add success.
W0319 10:52:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:52:14.455150 543705 disk_worker.go:708] disk space is not compliant
W0319 10:52:14.455153 543705 disk_worker.go:728] disk inode is not compliant
E0319 10:52:14.456930 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:52:14.456940 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:52:14.456946 543705 custom_config.go:64] query custom config with name: gpu
I0319 10:52:14.457003 543705 disk_worker.go:494] system disk:vda1
I0319 10:52:14.457034 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:52:15.456849 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:52:15.456858 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:52:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:52:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:52:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:52:16.457996 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:52:16.472339 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:52:18.760144 543705 disk_info.go:125] begin check local disk info of client
I0319 10:52:18.762521 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:52:18.762527 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5c00 0xc0000c5cc0]
E0319 10:52:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:23.409775 543705 memory.go:184] no items to output this cycle
I0319 10:52:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 10:52:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:33.409796 543705 memory.go:184] no items to output this cycle
I0319 10:52:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 10:52:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:43.409787 543705 memory.go:191] Add success.
I0319 10:52:43.409815 543705 cpu.go:282] Add success.
I0319 10:52:43.419882 543705 net.go:648] Add success.
I0319 10:52:43.422586 543705 net.go:770] primary dev: ETH0
I0319 10:52:43.422601 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:52:43.422614 543705 net.go:698] Add success.
I0319 10:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:52:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:52:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:52:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:52:53.409777 543705 memory.go:184] no items to output this cycle
I0319 10:52:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 10:53:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:03.409816 543705 memory.go:184] no items to output this cycle
I0319 10:53:03.409837 543705 cpu.go:275] no items to output this cycle
E0319 10:53:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:13.409802 543705 memory.go:191] Add success.
I0319 10:53:13.409803 543705 cpu.go:282] Add success.
W0319 10:53:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:53:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:53:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:53:13.420196 543705 net.go:648] Add success.
I0319 10:53:13.422891 543705 net.go:770] primary dev: ETH0
I0319 10:53:13.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:53:13.422920 543705 net.go:698] Add success.
I0319 10:53:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:53:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:53:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 10:53:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:53:14.456569 543705 disk_worker.go:494] system disk:vda1
I0319 10:53:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:53:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:53:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:53:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:53:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:53:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:53:18.763456 543705 disk_info.go:125] begin check local disk info of client
I0319 10:53:18.765814 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:53:18.765835 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331bc0 0xc000331c00]
E0319 10:53:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:23.409792 543705 memory.go:184] no items to output this cycle
I0319 10:53:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 10:53:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:33.409802 543705 memory.go:184] no items to output this cycle
I0319 10:53:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 10:53:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:43.409782 543705 memory.go:191] Add success.
I0319 10:53:43.409816 543705 cpu.go:282] Add success.
I0319 10:53:43.419891 543705 net.go:648] Add success.
I0319 10:53:43.423419 543705 net.go:770] primary dev: ETH0
I0319 10:53:43.423432 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:53:43.423444 543705 net.go:698] Add success.
I0319 10:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:53:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:53:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:53:53.410256 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:53:53.410273 543705 memory.go:184] no items to output this cycle
I0319 10:53:53.410272 543705 cpu.go:275] no items to output this cycle
E0319 10:54:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:03.409780 543705 memory.go:184] no items to output this cycle
I0319 10:54:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 10:54:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:13.409793 543705 memory.go:191] Add success.
I0319 10:54:13.409811 543705 cpu.go:282] Add success.
W0319 10:54:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:54:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:54:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:54:13.420120 543705 net.go:648] Add success.
I0319 10:54:13.423023 543705 net.go:770] primary dev: ETH0
I0319 10:54:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:54:13.423051 543705 net.go:698] Add success.
I0319 10:54:13.474541 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"359f2aba-95e3-4198-aacf-bf533b8d180b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:54:13.474575 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 10:54:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:54:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:54:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 10:54:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:54:14.456517 543705 disk_worker.go:494] system disk:vda1
I0319 10:54:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:54:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:54:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:54:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:54:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:54:18.766725 543705 disk_info.go:125] begin check local disk info of client
I0319 10:54:18.769153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:54:18.769159 543705 disk_info.go:196] parse disk info done, disk is : [0xc000582980 0xc0005829c0]
E0319 10:54:23.410262 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:23.410278 543705 memory.go:184] no items to output this cycle
I0319 10:54:23.410291 543705 cpu.go:275] no items to output this cycle
E0319 10:54:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:33.409803 543705 memory.go:184] no items to output this cycle
I0319 10:54:33.409812 543705 cpu.go:275] no items to output this cycle
I0319 10:54:37.687141 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:54:37.687147 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:54:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:43.410676 543705 memory.go:191] Add success.
I0319 10:54:43.409827 543705 cpu.go:282] Add success.
I0319 10:54:43.420389 543705 net.go:648] Add success.
I0319 10:54:43.423128 543705 net.go:770] primary dev: ETH0
I0319 10:54:43.423143 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:54:43.423158 543705 net.go:698] Add success.
I0319 10:54:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:54:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:54:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:54:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:54:53.409778 543705 memory.go:184] no items to output this cycle
I0319 10:54:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 10:55:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:03.409809 543705 memory.go:184] no items to output this cycle
I0319 10:55:03.409820 543705 cpu.go:275] no items to output this cycle
W0319 10:55:13.409714 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:55:13.409730 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:55:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 10:55:13.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:13.409819 543705 cpu.go:282] Add success.
I0319 10:55:13.409823 543705 memory.go:191] Add success.
I0319 10:55:13.420147 543705 net.go:648] Add success.
I0319 10:55:13.422997 543705 net.go:770] primary dev: ETH0
I0319 10:55:13.423012 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:55:13.423025 543705 net.go:698] Add success.
I0319 10:55:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:55:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:55:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 10:55:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:55:14.456595 543705 disk_worker.go:494] system disk:vda1
I0319 10:55:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:55:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:55:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:55:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:55:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:55:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:55:18.769673 543705 disk_info.go:125] begin check local disk info of client
I0319 10:55:18.772087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:55:18.772093 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f6600 0xc0004f6640]
E0319 10:55:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:23.409766 543705 memory.go:184] no items to output this cycle
I0319 10:55:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 10:55:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:33.409804 543705 memory.go:184] no items to output this cycle
I0319 10:55:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 10:55:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:43.409786 543705 memory.go:191] Add success.
I0319 10:55:43.409803 543705 cpu.go:282] Add success.
I0319 10:55:43.419871 543705 net.go:648] Add success.
I0319 10:55:43.422751 543705 net.go:770] primary dev: ETH0
I0319 10:55:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:55:43.422776 543705 net.go:698] Add success.
I0319 10:55:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:55:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:55:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:55:53.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:55:53.409819 543705 memory.go:184] no items to output this cycle
I0319 10:55:53.409830 543705 cpu.go:275] no items to output this cycle
E0319 10:56:03.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:03.409891 543705 memory.go:184] no items to output this cycle
I0319 10:56:03.409999 543705 cpu.go:275] no items to output this cycle
E0319 10:56:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:13.409798 543705 memory.go:191] Add success.
I0319 10:56:13.409800 543705 cpu.go:282] Add success.
W0319 10:56:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:56:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:56:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:56:13.420146 543705 net.go:648] Add success.
I0319 10:56:13.422709 543705 net.go:770] primary dev: ETH0
I0319 10:56:13.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:56:13.422733 543705 net.go:698] Add success.
I0319 10:56:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:56:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:56:14.455236 543705 disk_worker.go:708] disk space is not compliant
W0319 10:56:14.455239 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:56:14.456602 543705 disk_worker.go:494] system disk:vda1
I0319 10:56:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:56:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:56:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:56:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:56:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:56:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:56:18.773506 543705 disk_info.go:125] begin check local disk info of client
I0319 10:56:18.775830 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:56:18.775836 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330bc0 0xc000330c00]
E0319 10:56:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:23.409794 543705 memory.go:184] no items to output this cycle
I0319 10:56:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 10:56:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:33.409788 543705 memory.go:184] no items to output this cycle
I0319 10:56:33.409788 543705 cpu.go:275] no items to output this cycle
E0319 10:56:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:43.409791 543705 memory.go:191] Add success.
I0319 10:56:43.409795 543705 cpu.go:282] Add success.
I0319 10:56:43.419888 543705 net.go:648] Add success.
I0319 10:56:43.422642 543705 net.go:770] primary dev: ETH0
I0319 10:56:43.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:56:43.422672 543705 net.go:698] Add success.
I0319 10:56:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:56:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:56:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:56:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:56:53.409779 543705 cpu.go:275] no items to output this cycle
I0319 10:56:53.409780 543705 memory.go:184] no items to output this cycle
E0319 10:57:03.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:03.409909 543705 memory.go:184] no items to output this cycle
I0319 10:57:03.409886 543705 cpu.go:275] no items to output this cycle
E0319 10:57:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:13.409781 543705 memory.go:191] Add success.
W0319 10:57:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 10:57:13.409813 543705 cpu.go:282] Add success.
W0319 10:57:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:57:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:57:13.420163 543705 net.go:648] Add success.
I0319 10:57:13.429210 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 10:57:13.429283 543705 net.go:770] primary dev: ETH0
I0319 10:57:13.429295 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:57:13.429306 543705 net.go:698] Add success.
I0319 10:57:13.453031 543705 event_worker.go:152] Polling the log file for events...
I0319 10:57:13.468473 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0094a6e6-3c34-43e6-93dc-c63dc816a11b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 10:57:13.468504 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 10:57:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:57:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 10:57:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0319 10:57:14.456006 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 10:57:14.456014 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 10:57:14.456020 543705 custom_config.go:64] query custom config with name: gpu
I0319 10:57:14.456468 543705 disk_worker.go:494] system disk:vda1
I0319 10:57:14.456494 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 10:57:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 10:57:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:57:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 10:57:16.457991 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 10:57:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:57:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:57:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:57:18.776581 543705 disk_info.go:125] begin check local disk info of client
I0319 10:57:18.778893 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:57:18.778899 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331640 0xc000331680]
E0319 10:57:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:23.409799 543705 memory.go:184] no items to output this cycle
I0319 10:57:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 10:57:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 10:57:33.409796 543705 memory.go:184] no items to output this cycle
I0319 10:57:37.687292 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 10:57:37.687299 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 10:57:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:43.410538 543705 memory.go:191] Add success.
I0319 10:57:43.409787 543705 cpu.go:282] Add success.
I0319 10:57:43.420324 543705 net.go:648] Add success.
I0319 10:57:43.423068 543705 net.go:770] primary dev: ETH0
I0319 10:57:43.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:57:43.423100 543705 net.go:698] Add success.
I0319 10:57:46.457667 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:57:46.457734 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:57:46.457759 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:57:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:57:53.409775 543705 memory.go:184] no items to output this cycle
I0319 10:57:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 10:58:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:03.409786 543705 memory.go:184] no items to output this cycle
I0319 10:58:03.409790 543705 cpu.go:275] no items to output this cycle
W0319 10:58:13.409708 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:58:13.409724 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:58:13.409728 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 10:58:13.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:13.409822 543705 memory.go:191] Add success.
I0319 10:58:13.409828 543705 cpu.go:282] Add success.
I0319 10:58:13.420080 543705 net.go:648] Add success.
I0319 10:58:13.423223 543705 net.go:770] primary dev: ETH0
I0319 10:58:13.423236 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:58:13.423258 543705 net.go:698] Add success.
I0319 10:58:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:58:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:58:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 10:58:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:58:14.456585 543705 disk_worker.go:494] system disk:vda1
I0319 10:58:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:58:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:58:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:58:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:58:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:58:18.780537 543705 disk_info.go:125] begin check local disk info of client
I0319 10:58:18.783132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:58:18.783138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5f00 0xc0000c5f40]
E0319 10:58:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:23.409773 543705 memory.go:184] no items to output this cycle
I0319 10:58:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 10:58:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:33.409811 543705 memory.go:184] no items to output this cycle
I0319 10:58:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 10:58:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:43.409780 543705 memory.go:191] Add success.
I0319 10:58:43.409808 543705 cpu.go:282] Add success.
I0319 10:58:43.419738 543705 net.go:648] Add success.
I0319 10:58:43.422777 543705 net.go:770] primary dev: ETH0
I0319 10:58:43.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:58:43.422805 543705 net.go:698] Add success.
I0319 10:58:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:58:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:58:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:58:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:58:53.409794 543705 memory.go:184] no items to output this cycle
I0319 10:58:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 10:59:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:03.409788 543705 cpu.go:275] no items to output this cycle
I0319 10:59:03.409793 543705 memory.go:184] no items to output this cycle
E0319 10:59:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:13.409810 543705 memory.go:191] Add success.
I0319 10:59:13.409818 543705 cpu.go:282] Add success.
W0319 10:59:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 10:59:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 10:59:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 10:59:13.420070 543705 net.go:648] Add success.
I0319 10:59:13.422785 543705 net.go:770] primary dev: ETH0
I0319 10:59:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:59:13.422819 543705 net.go:698] Add success.
I0319 10:59:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 10:59:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 10:59:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 10:59:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0319 10:59:14.456606 543705 disk_worker.go:494] system disk:vda1
I0319 10:59:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 10:59:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 10:59:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:59:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:59:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 10:59:16.472485 543705 disk_local_worker.go:436] Get disk info: []
I0319 10:59:18.783218 543705 disk_info.go:125] begin check local disk info of client
I0319 10:59:18.785618 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 10:59:18.785624 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ba40 0xc00007ba80]
E0319 10:59:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:23.409778 543705 cpu.go:275] no items to output this cycle
I0319 10:59:23.409780 543705 memory.go:184] no items to output this cycle
E0319 10:59:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:33.409808 543705 memory.go:184] no items to output this cycle
I0319 10:59:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 10:59:43.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:43.409916 543705 cpu.go:282] Add success.
I0319 10:59:43.409919 543705 memory.go:191] Add success.
I0319 10:59:43.419736 543705 net.go:648] Add success.
I0319 10:59:43.422405 543705 net.go:770] primary dev: ETH0
I0319 10:59:43.422420 543705 net.go:802] Send network stats successfully!,count is 6
I0319 10:59:43.422434 543705 net.go:698] Add success.
I0319 10:59:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 10:59:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 10:59:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 10:59:53.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 10:59:53.409761 543705 memory.go:184] no items to output this cycle
I0319 10:59:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:00:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:03.409806 543705 memory.go:184] no items to output this cycle
I0319 11:00:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 11:00:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:13.409788 543705 memory.go:191] Add success.
I0319 11:00:13.409805 543705 cpu.go:282] Add success.
W0319 11:00:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:00:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:00:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:00:13.420056 543705 net.go:648] Add success.
I0319 11:00:13.423241 543705 net.go:770] primary dev: ETH0
I0319 11:00:13.423256 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:00:13.423270 543705 net.go:698] Add success.
I0319 11:00:13.468122 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a2068d41-afed-4f9d-98ed-d5b3c6e71f7a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:00:13.468154 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:00:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:00:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:00:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 11:00:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:00:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 11:00:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:00:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:00:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:00:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:00:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:00:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:00:18.785674 543705 disk_info.go:125] begin check local disk info of client
I0319 11:00:18.788052 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:00:18.788057 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000f12c0 0xc0000f1300]
E0319 11:00:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:23.409775 543705 memory.go:184] no items to output this cycle
I0319 11:00:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 11:00:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:33.409917 543705 cpu.go:275] no items to output this cycle
I0319 11:00:33.409921 543705 memory.go:184] no items to output this cycle
I0319 11:00:37.688136 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:00:37.688143 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:00:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:43.410746 543705 memory.go:191] Add success.
I0319 11:00:43.409807 543705 cpu.go:282] Add success.
I0319 11:00:43.420474 543705 net.go:648] Add success.
I0319 11:00:43.423500 543705 net.go:770] primary dev: ETH0
I0319 11:00:43.423514 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:00:43.423529 543705 net.go:698] Add success.
I0319 11:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:00:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:00:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:00:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:00:53.409794 543705 memory.go:184] no items to output this cycle
I0319 11:00:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:01:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:03.409770 543705 memory.go:184] no items to output this cycle
I0319 11:01:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:01:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:13.409790 543705 memory.go:191] Add success.
I0319 11:01:13.409794 543705 cpu.go:282] Add success.
W0319 11:01:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:01:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:01:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:01:13.420037 543705 net.go:648] Add success.
I0319 11:01:13.422770 543705 net.go:770] primary dev: ETH0
I0319 11:01:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:01:13.422796 543705 net.go:698] Add success.
I0319 11:01:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:01:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:01:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 11:01:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:01:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 11:01:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:01:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:01:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:01:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:01:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:01:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:01:18.789578 543705 disk_info.go:125] begin check local disk info of client
I0319 11:01:18.791849 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:01:18.791856 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034fa00 0xc00034fa40]
E0319 11:01:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:23.409775 543705 memory.go:184] no items to output this cycle
I0319 11:01:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 11:01:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:33.409818 543705 memory.go:184] no items to output this cycle
I0319 11:01:33.409831 543705 cpu.go:275] no items to output this cycle
E0319 11:01:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:43.409784 543705 memory.go:191] Add success.
I0319 11:01:43.409808 543705 cpu.go:282] Add success.
I0319 11:01:43.420022 543705 net.go:648] Add success.
I0319 11:01:43.422745 543705 net.go:770] primary dev: ETH0
I0319 11:01:43.422760 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:01:43.422773 543705 net.go:698] Add success.
I0319 11:01:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:01:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:01:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:01:53.410272 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:01:53.410289 543705 memory.go:184] no items to output this cycle
I0319 11:01:53.410293 543705 cpu.go:275] no items to output this cycle
E0319 11:02:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:03.409789 543705 memory.go:184] no items to output this cycle
I0319 11:02:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 11:02:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:13.409785 543705 memory.go:191] Add success.
I0319 11:02:13.409801 543705 cpu.go:282] Add success.
W0319 11:02:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:02:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:02:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:02:13.420344 543705 net.go:648] Add success.
I0319 11:02:13.422920 543705 net.go:770] primary dev: ETH0
I0319 11:02:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:02:13.422949 543705 net.go:698] Add success.
W0319 11:02:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:02:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 11:02:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:02:14.456797 543705 disk_worker.go:494] system disk:vda1
I0319 11:02:14.456836 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:02:14.457116 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:02:14.457124 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:02:14.457128 543705 custom_config.go:64] query custom config with name: gpu
E0319 11:02:15.456829 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:02:15.456838 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:02:16.457913 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:02:16.457913 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:02:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:02:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:02:16.472318 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:02:18.791940 543705 disk_info.go:125] begin check local disk info of client
I0319 11:02:18.794416 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:02:18.794424 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487680 0xc0004876c0]
E0319 11:02:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:23.409764 543705 memory.go:184] no items to output this cycle
I0319 11:02:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:02:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:33.409791 543705 memory.go:184] no items to output this cycle
I0319 11:02:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:02:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:43.409781 543705 memory.go:191] Add success.
I0319 11:02:43.409802 543705 cpu.go:282] Add success.
I0319 11:02:43.419872 543705 net.go:648] Add success.
I0319 11:02:43.422531 543705 net.go:770] primary dev: ETH0
I0319 11:02:43.422543 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:02:43.422555 543705 net.go:698] Add success.
I0319 11:02:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:02:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:02:53.410217 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:02:53.410238 543705 memory.go:184] no items to output this cycle
I0319 11:02:53.410248 543705 cpu.go:275] no items to output this cycle
E0319 11:03:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:03.409798 543705 memory.go:184] no items to output this cycle
I0319 11:03:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 11:03:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:13.409799 543705 memory.go:191] Add success.
I0319 11:03:13.409803 543705 cpu.go:282] Add success.
W0319 11:03:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:03:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:03:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:03:13.420194 543705 net.go:648] Add success.
I0319 11:03:13.422838 543705 net.go:770] primary dev: ETH0
I0319 11:03:13.422851 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:03:13.422863 543705 net.go:698] Add success.
I0319 11:03:13.469380 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3285e59e-e48d-4aaf-bfcc-db30d58822dc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:03:13.469414 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:03:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:03:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:03:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 11:03:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:03:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 11:03:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:03:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:03:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:03:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:03:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:03:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:03:18.794515 543705 disk_info.go:125] begin check local disk info of client
I0319 11:03:18.796958 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:03:18.796965 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c000 0xc00035c040]
E0319 11:03:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:23.409806 543705 memory.go:184] no items to output this cycle
I0319 11:03:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 11:03:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:33.409786 543705 memory.go:184] no items to output this cycle
I0319 11:03:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 11:03:37.688279 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:03:37.688286 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:03:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:43.410676 543705 memory.go:191] Add success.
I0319 11:03:43.409815 543705 cpu.go:282] Add success.
I0319 11:03:43.420450 543705 net.go:648] Add success.
I0319 11:03:43.423372 543705 net.go:770] primary dev: ETH0
I0319 11:03:43.423386 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:03:43.423398 543705 net.go:698] Add success.
I0319 11:03:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:03:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:03:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:03:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:03:53.409789 543705 cpu.go:275] no items to output this cycle
I0319 11:03:53.409791 543705 memory.go:184] no items to output this cycle
E0319 11:04:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:03.409789 543705 memory.go:184] no items to output this cycle
I0319 11:04:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:04:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:13.409815 543705 memory.go:191] Add success.
I0319 11:04:13.409824 543705 cpu.go:282] Add success.
W0319 11:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:04:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:04:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:04:13.420129 543705 net.go:648] Add success.
I0319 11:04:13.423061 543705 net.go:770] primary dev: ETH0
I0319 11:04:13.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:04:13.423090 543705 net.go:698] Add success.
I0319 11:04:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:04:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:04:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 11:04:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:04:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 11:04:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:04:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:04:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:04:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:04:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:04:16.472091 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:04:18.797676 543705 disk_info.go:125] begin check local disk info of client
I0319 11:04:18.800146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:04:18.800151 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a380 0xc00048a3c0]
E0319 11:04:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:23.409782 543705 memory.go:184] no items to output this cycle
I0319 11:04:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:04:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:33.409783 543705 memory.go:184] no items to output this cycle
I0319 11:04:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 11:04:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:43.409780 543705 memory.go:191] Add success.
I0319 11:04:43.409800 543705 cpu.go:282] Add success.
I0319 11:04:43.419960 543705 net.go:648] Add success.
I0319 11:04:43.422962 543705 net.go:770] primary dev: ETH0
I0319 11:04:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:04:43.422990 543705 net.go:698] Add success.
I0319 11:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:04:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:04:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:04:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:04:53.409767 543705 memory.go:184] no items to output this cycle
I0319 11:04:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:05:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:03.409811 543705 memory.go:184] no items to output this cycle
I0319 11:05:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 11:05:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:13.409790 543705 memory.go:191] Add success.
I0319 11:05:13.409797 543705 cpu.go:282] Add success.
W0319 11:05:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:05:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:05:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:05:13.420201 543705 net.go:648] Add success.
I0319 11:05:13.423248 543705 net.go:770] primary dev: ETH0
I0319 11:05:13.423261 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:05:13.423273 543705 net.go:698] Add success.
I0319 11:05:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:05:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:05:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0319 11:05:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:05:14.456477 543705 disk_worker.go:494] system disk:vda1
I0319 11:05:14.456521 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:05:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:05:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:05:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:05:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:05:16.472477 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:05:18.800234 543705 disk_info.go:125] begin check local disk info of client
I0319 11:05:18.802715 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:05:18.802722 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a86c0 0xc0004a8700]
E0319 11:05:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:23.409799 543705 memory.go:184] no items to output this cycle
I0319 11:05:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:05:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:33.409782 543705 memory.go:184] no items to output this cycle
I0319 11:05:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 11:05:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:43.409792 543705 memory.go:191] Add success.
I0319 11:05:43.409796 543705 cpu.go:282] Add success.
I0319 11:05:43.419967 543705 net.go:648] Add success.
I0319 11:05:43.423038 543705 net.go:770] primary dev: ETH0
I0319 11:05:43.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:05:43.423062 543705 net.go:698] Add success.
I0319 11:05:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:05:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:05:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:05:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:05:53.409766 543705 memory.go:184] no items to output this cycle
I0319 11:05:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:06:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:03.409785 543705 memory.go:184] no items to output this cycle
I0319 11:06:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 11:06:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:13.409792 543705 memory.go:191] Add success.
I0319 11:06:13.409794 543705 cpu.go:282] Add success.
W0319 11:06:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:06:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:06:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:06:13.420070 543705 net.go:648] Add success.
I0319 11:06:13.422652 543705 net.go:770] primary dev: ETH0
I0319 11:06:13.422667 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:06:13.422681 543705 net.go:698] Add success.
I0319 11:06:13.468454 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9c08612f-1af6-4b1b-90dc-586544af2d54","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:06:13.468488 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:06:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:06:14.455262 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:06:14.455355 543705 disk_worker.go:708] disk space is not compliant
W0319 11:06:14.455359 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:06:14.457477 543705 disk_worker.go:494] system disk:vda1
I0319 11:06:14.457525 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:06:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:06:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:06:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:06:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:06:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:06:18.803734 543705 disk_info.go:125] begin check local disk info of client
I0319 11:06:18.806141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:06:18.806147 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d65c0 0xc0004d6600]
E0319 11:06:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:23.409777 543705 memory.go:184] no items to output this cycle
I0319 11:06:23.409780 543705 cpu.go:275] no items to output this cycle
E0319 11:06:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:33.409795 543705 memory.go:184] no items to output this cycle
I0319 11:06:33.409802 543705 cpu.go:275] no items to output this cycle
I0319 11:06:37.689131 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:06:37.689138 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:06:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:43.410742 543705 memory.go:191] Add success.
I0319 11:06:43.409828 543705 cpu.go:282] Add success.
I0319 11:06:43.420584 543705 net.go:648] Add success.
I0319 11:06:43.423714 543705 net.go:770] primary dev: ETH0
I0319 11:06:43.423727 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:06:43.423740 543705 net.go:698] Add success.
I0319 11:06:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:06:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:06:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:06:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:06:53.409802 543705 cpu.go:275] no items to output this cycle
I0319 11:06:53.409805 543705 memory.go:184] no items to output this cycle
E0319 11:07:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:03.409814 543705 memory.go:184] no items to output this cycle
I0319 11:07:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 11:07:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:13.409795 543705 memory.go:191] Add success.
I0319 11:07:13.409823 543705 cpu.go:282] Add success.
W0319 11:07:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:07:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:07:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:07:13.420071 543705 net.go:648] Add success.
I0319 11:07:13.423123 543705 net.go:770] primary dev: ETH0
I0319 11:07:13.423138 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:07:13.423151 543705 net.go:698] Add success.
I0319 11:07:13.453668 543705 event_worker.go:152] Polling the log file for events...
W0319 11:07:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:07:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 11:07:14.455175 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:07:14.456158 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:07:14.456168 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:07:14.456174 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:07:14.456449 543705 disk_worker.go:494] system disk:vda1
I0319 11:07:14.456506 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:07:15.456794 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:07:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:07:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:07:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:07:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:07:16.458020 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:07:16.472342 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:07:18.807669 543705 disk_info.go:125] begin check local disk info of client
I0319 11:07:18.810041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:07:18.810047 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c140 0xc00039c180]
E0319 11:07:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:23.409786 543705 memory.go:184] no items to output this cycle
I0319 11:07:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:07:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:33.409794 543705 memory.go:184] no items to output this cycle
I0319 11:07:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:07:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:43.409794 543705 memory.go:191] Add success.
I0319 11:07:43.409817 543705 cpu.go:282] Add success.
I0319 11:07:43.419902 543705 net.go:648] Add success.
I0319 11:07:43.422962 543705 net.go:770] primary dev: ETH0
I0319 11:07:43.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:07:43.422988 543705 net.go:698] Add success.
I0319 11:07:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:07:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:07:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:07:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:07:53.409778 543705 memory.go:184] no items to output this cycle
I0319 11:07:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:08:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:03.409782 543705 memory.go:184] no items to output this cycle
I0319 11:08:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 11:08:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:13.409830 543705 memory.go:191] Add success.
I0319 11:08:13.409851 543705 cpu.go:282] Add success.
W0319 11:08:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:08:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:08:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:08:13.420550 543705 net.go:648] Add success.
I0319 11:08:13.423540 543705 net.go:770] primary dev: ETH0
I0319 11:08:13.423555 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:08:13.423571 543705 net.go:698] Add success.
I0319 11:08:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:08:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:08:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 11:08:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:08:14.456592 543705 disk_worker.go:494] system disk:vda1
I0319 11:08:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:08:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:08:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:08:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:08:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:08:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:08:18.811752 543705 disk_info.go:125] begin check local disk info of client
I0319 11:08:18.814163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:08:18.814169 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a0c0 0xc00039a100]
E0319 11:08:23.410241 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:23.410256 543705 memory.go:184] no items to output this cycle
I0319 11:08:23.410267 543705 cpu.go:275] no items to output this cycle
E0319 11:08:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:33.409780 543705 memory.go:184] no items to output this cycle
I0319 11:08:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 11:08:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:43.409823 543705 memory.go:191] Add success.
I0319 11:08:43.409825 543705 cpu.go:282] Add success.
I0319 11:08:43.420027 543705 net.go:648] Add success.
I0319 11:08:43.422853 543705 net.go:770] primary dev: ETH0
I0319 11:08:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:08:43.422879 543705 net.go:698] Add success.
I0319 11:08:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:08:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:08:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:08:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:08:53.409794 543705 memory.go:184] no items to output this cycle
I0319 11:08:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 11:09:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:03.409779 543705 memory.go:184] no items to output this cycle
I0319 11:09:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 11:09:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:13.409789 543705 memory.go:191] Add success.
I0319 11:09:13.409808 543705 cpu.go:282] Add success.
W0319 11:09:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:09:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:09:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:09:13.420163 543705 net.go:648] Add success.
I0319 11:09:13.422959 543705 net.go:770] primary dev: ETH0
I0319 11:09:13.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:09:13.422998 543705 net.go:698] Add success.
I0319 11:09:13.470471 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9f623839-1cc3-48bd-bf1a-14c090ca4b5c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:09:13.470503 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:09:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:09:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 11:09:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:09:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 11:09:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:09:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:09:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:09:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:09:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:09:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:09:18.815778 543705 disk_info.go:125] begin check local disk info of client
I0319 11:09:18.818187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:09:18.818193 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa100 0xc0001aa140]
E0319 11:09:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:23.409773 543705 memory.go:184] no items to output this cycle
I0319 11:09:23.409777 543705 cpu.go:275] no items to output this cycle
E0319 11:09:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:33.409782 543705 memory.go:184] no items to output this cycle
I0319 11:09:33.409791 543705 cpu.go:275] no items to output this cycle
I0319 11:09:37.689728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:09:37.689734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:09:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:43.410822 543705 memory.go:191] Add success.
I0319 11:09:43.409817 543705 cpu.go:282] Add success.
I0319 11:09:43.420615 543705 net.go:648] Add success.
I0319 11:09:43.423335 543705 net.go:770] primary dev: ETH0
I0319 11:09:43.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:09:43.423369 543705 net.go:698] Add success.
I0319 11:09:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:09:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:09:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:09:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:09:53.409766 543705 memory.go:184] no items to output this cycle
I0319 11:09:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 11:10:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:03.409807 543705 memory.go:184] no items to output this cycle
I0319 11:10:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 11:10:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:13.409787 543705 memory.go:191] Add success.
I0319 11:10:13.409808 543705 cpu.go:282] Add success.
W0319 11:10:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:10:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:10:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:10:13.420115 543705 net.go:648] Add success.
I0319 11:10:13.423052 543705 net.go:770] primary dev: ETH0
I0319 11:10:13.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:10:13.423086 543705 net.go:698] Add success.
I0319 11:10:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:10:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:10:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 11:10:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:10:14.456584 543705 disk_worker.go:494] system disk:vda1
I0319 11:10:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:10:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:10:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:10:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:10:16.472431 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:10:18.819730 543705 disk_info.go:125] begin check local disk info of client
I0319 11:10:18.822110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:10:18.822116 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344080 0xc0003440c0]
E0319 11:10:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:23.409778 543705 memory.go:184] no items to output this cycle
I0319 11:10:23.409783 543705 cpu.go:275] no items to output this cycle
E0319 11:10:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:33.409777 543705 memory.go:184] no items to output this cycle
I0319 11:10:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 11:10:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:43.409813 543705 memory.go:191] Add success.
I0319 11:10:43.409820 543705 cpu.go:282] Add success.
I0319 11:10:43.419988 543705 net.go:648] Add success.
I0319 11:10:43.423006 543705 net.go:770] primary dev: ETH0
I0319 11:10:43.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:10:43.423032 543705 net.go:698] Add success.
I0319 11:10:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:10:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:10:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:10:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:10:53.409799 543705 memory.go:184] no items to output this cycle
I0319 11:10:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:11:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:03.409804 543705 memory.go:184] no items to output this cycle
I0319 11:11:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 11:11:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:13.409813 543705 memory.go:191] Add success.
I0319 11:11:13.409815 543705 cpu.go:282] Add success.
W0319 11:11:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:11:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:11:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:11:13.420143 543705 net.go:648] Add success.
I0319 11:11:13.422838 543705 net.go:770] primary dev: ETH0
I0319 11:11:13.422853 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:11:13.422865 543705 net.go:698] Add success.
I0319 11:11:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:11:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:11:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 11:11:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:11:14.456602 543705 disk_worker.go:494] system disk:vda1
I0319 11:11:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:11:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:11:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:11:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:11:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:11:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:11:18.822190 543705 disk_info.go:125] begin check local disk info of client
I0319 11:11:18.824614 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:11:18.824621 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466980 0xc0004669c0]
E0319 11:11:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:23.409766 543705 memory.go:184] no items to output this cycle
I0319 11:11:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:11:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:33.409791 543705 memory.go:184] no items to output this cycle
I0319 11:11:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:11:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:43.409787 543705 memory.go:191] Add success.
I0319 11:11:43.409788 543705 cpu.go:282] Add success.
I0319 11:11:43.419972 543705 net.go:648] Add success.
I0319 11:11:43.423232 543705 net.go:770] primary dev: ETH0
I0319 11:11:43.423245 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:11:43.423258 543705 net.go:698] Add success.
I0319 11:11:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:11:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:11:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:11:53.409807 543705 memory.go:184] no items to output this cycle
I0319 11:11:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 11:12:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:03.409781 543705 memory.go:184] no items to output this cycle
I0319 11:12:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:12:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:13.409818 543705 memory.go:191] Add success.
I0319 11:12:13.409822 543705 cpu.go:282] Add success.
W0319 11:12:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:12:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:12:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:12:13.420140 543705 net.go:648] Add success.
I0319 11:12:13.422862 543705 net.go:770] primary dev: ETH0
I0319 11:12:13.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:12:13.422888 543705 net.go:698] Add success.
I0319 11:12:13.464467 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"16f672e7-153f-4ae6-9aa2-bfc65b266d66","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:12:13.464503 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 11:12:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:12:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 11:12:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:12:14.456830 543705 disk_worker.go:494] system disk:vda1
I0319 11:12:14.456872 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:12:14.457076 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:12:14.457084 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:12:14.457088 543705 custom_config.go:64] query custom config with name: gpu
E0319 11:12:15.456887 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:12:15.456895 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:12:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:12:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:12:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:12:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:12:16.472334 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:12:18.825673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:12:18.827966 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:12:18.827972 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ea0c0 0xc0000ea100]
E0319 11:12:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:23.409768 543705 memory.go:184] no items to output this cycle
I0319 11:12:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 11:12:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:33.409777 543705 memory.go:184] no items to output this cycle
I0319 11:12:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 11:12:37.691145 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:12:37.691152 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:12:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:43.410613 543705 memory.go:191] Add success.
I0319 11:12:43.409797 543705 cpu.go:282] Add success.
I0319 11:12:43.420419 543705 net.go:648] Add success.
I0319 11:12:43.423496 543705 net.go:770] primary dev: ETH0
I0319 11:12:43.423509 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:12:43.423521 543705 net.go:698] Add success.
I0319 11:12:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:12:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:12:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:12:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:12:53.409782 543705 memory.go:184] no items to output this cycle
I0319 11:12:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 11:13:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:03.409784 543705 memory.go:184] no items to output this cycle
I0319 11:13:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 11:13:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:13.409819 543705 memory.go:191] Add success.
I0319 11:13:13.409820 543705 cpu.go:282] Add success.
W0319 11:13:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:13:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:13:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:13:13.420160 543705 net.go:648] Add success.
I0319 11:13:13.423193 543705 net.go:770] primary dev: ETH0
I0319 11:13:13.423206 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:13:13.423218 543705 net.go:698] Add success.
I0319 11:13:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:13:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:13:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 11:13:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:13:14.456521 543705 disk_worker.go:494] system disk:vda1
I0319 11:13:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:13:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:13:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:13:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:13:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:13:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:13:18.829672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:13:18.831993 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:13:18.831999 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048c100 0xc00048c140]
E0319 11:13:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:23.409767 543705 memory.go:184] no items to output this cycle
I0319 11:13:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 11:13:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:33.409812 543705 memory.go:184] no items to output this cycle
I0319 11:13:33.409826 543705 cpu.go:275] no items to output this cycle
E0319 11:13:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:43.409813 543705 memory.go:191] Add success.
I0319 11:13:43.409819 543705 cpu.go:282] Add success.
I0319 11:13:43.419984 543705 net.go:648] Add success.
I0319 11:13:43.423024 543705 net.go:770] primary dev: ETH0
I0319 11:13:43.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:13:43.423049 543705 net.go:698] Add success.
I0319 11:13:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:13:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:13:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:13:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:13:53.409770 543705 memory.go:184] no items to output this cycle
I0319 11:13:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 11:14:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:03.409810 543705 memory.go:184] no items to output this cycle
I0319 11:14:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 11:14:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:13.409817 543705 memory.go:191] Add success.
I0319 11:14:13.409823 543705 cpu.go:282] Add success.
W0319 11:14:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:14:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:14:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:14:13.420100 543705 net.go:648] Add success.
I0319 11:14:13.423092 543705 net.go:770] primary dev: ETH0
I0319 11:14:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:14:13.423116 543705 net.go:698] Add success.
I0319 11:14:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:14:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:14:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 11:14:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:14:14.456584 543705 disk_worker.go:494] system disk:vda1
I0319 11:14:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:14:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:14:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:14:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:14:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:14:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:14:18.833673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:14:18.836047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:14:18.836053 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a100 0xc00048a140]
E0319 11:14:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:23.409788 543705 memory.go:184] no items to output this cycle
I0319 11:14:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 11:14:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:33.409783 543705 memory.go:184] no items to output this cycle
I0319 11:14:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 11:14:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:43.409812 543705 memory.go:191] Add success.
I0319 11:14:43.409825 543705 cpu.go:282] Add success.
I0319 11:14:43.419993 543705 net.go:648] Add success.
I0319 11:14:43.422949 543705 net.go:770] primary dev: ETH0
I0319 11:14:43.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:14:43.422979 543705 net.go:698] Add success.
I0319 11:14:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:14:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:14:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:14:53.409776 543705 cpu.go:275] no items to output this cycle
I0319 11:14:53.409779 543705 memory.go:184] no items to output this cycle
E0319 11:15:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:03.409810 543705 memory.go:184] no items to output this cycle
I0319 11:15:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 11:15:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:13.409791 543705 memory.go:191] Add success.
I0319 11:15:13.409792 543705 cpu.go:282] Add success.
W0319 11:15:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:15:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:15:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:15:13.420131 543705 net.go:648] Add success.
I0319 11:15:13.422699 543705 net.go:770] primary dev: ETH0
I0319 11:15:13.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:15:13.422725 543705 net.go:698] Add success.
I0319 11:15:13.469125 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"44be6a60-5929-4f21-8e12-cf90924580ff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:15:13.469159 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:15:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:15:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:15:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 11:15:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:15:14.456630 543705 disk_worker.go:494] system disk:vda1
I0319 11:15:14.456661 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:15:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:15:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:15:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:15:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:15:18.837674 543705 disk_info.go:125] begin check local disk info of client
I0319 11:15:18.840041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:15:18.840047 543705 disk_info.go:196] parse disk info done, disk is : [0xc000296000 0xc000296040]
E0319 11:15:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:23.409760 543705 memory.go:184] no items to output this cycle
I0319 11:15:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 11:15:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:33.409768 543705 memory.go:184] no items to output this cycle
I0319 11:15:33.409803 543705 cpu.go:275] no items to output this cycle
I0319 11:15:37.692144 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:15:37.692151 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:15:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:43.410635 543705 memory.go:191] Add success.
I0319 11:15:43.409787 543705 cpu.go:282] Add success.
I0319 11:15:43.420430 543705 net.go:648] Add success.
I0319 11:15:43.422948 543705 net.go:770] primary dev: ETH0
I0319 11:15:43.422964 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:15:43.422978 543705 net.go:698] Add success.
I0319 11:15:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:15:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:15:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:15:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:15:53.409798 543705 memory.go:184] no items to output this cycle
I0319 11:15:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:16:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:03.409783 543705 cpu.go:275] no items to output this cycle
I0319 11:16:03.409790 543705 memory.go:184] no items to output this cycle
E0319 11:16:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:13.409795 543705 memory.go:191] Add success.
I0319 11:16:13.409799 543705 cpu.go:282] Add success.
W0319 11:16:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:16:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:16:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:16:13.420222 543705 net.go:648] Add success.
I0319 11:16:13.423228 543705 net.go:770] primary dev: ETH0
I0319 11:16:13.423241 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:16:13.423255 543705 net.go:698] Add success.
I0319 11:16:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:16:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:16:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 11:16:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:16:14.459221 543705 disk_worker.go:494] system disk:vda1
I0319 11:16:14.459253 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:16:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:16:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:16:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:16:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:16:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:16:18.841673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:16:18.844128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:16:18.844134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee100 0xc0003ee140]
E0319 11:16:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:23.409774 543705 cpu.go:275] no items to output this cycle
I0319 11:16:23.409777 543705 memory.go:184] no items to output this cycle
E0319 11:16:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:33.409779 543705 memory.go:184] no items to output this cycle
I0319 11:16:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 11:16:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:43.409783 543705 memory.go:191] Add success.
I0319 11:16:43.409806 543705 cpu.go:282] Add success.
I0319 11:16:43.419840 543705 net.go:648] Add success.
I0319 11:16:43.422574 543705 net.go:770] primary dev: ETH0
I0319 11:16:43.422589 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:16:43.422601 543705 net.go:698] Add success.
I0319 11:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:16:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:16:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:16:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:16:53.409783 543705 memory.go:184] no items to output this cycle
I0319 11:16:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 11:17:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:03.409786 543705 memory.go:184] no items to output this cycle
I0319 11:17:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 11:17:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:13.409825 543705 memory.go:191] Add success.
I0319 11:17:13.409830 543705 cpu.go:282] Add success.
W0319 11:17:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:17:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:17:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:17:13.420209 543705 net.go:648] Add success.
I0319 11:17:13.422819 543705 net.go:770] primary dev: ETH0
I0319 11:17:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:17:13.422844 543705 net.go:698] Add success.
I0319 11:17:13.453375 543705 event_worker.go:152] Polling the log file for events...
W0319 11:17:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:17:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 11:17:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:17:14.455881 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:17:14.455890 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:17:14.455896 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:17:14.456786 543705 disk_worker.go:494] system disk:vda1
I0319 11:17:14.456881 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:17:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:17:15.456867 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:17:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:17:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:17:16.457973 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:17:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:17:16.472327 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:17:18.845670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:17:18.848035 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:17:18.848041 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003eefc0 0xc0003ef000]
E0319 11:17:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:23.409780 543705 memory.go:184] no items to output this cycle
I0319 11:17:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 11:17:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:33.409812 543705 memory.go:184] no items to output this cycle
I0319 11:17:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 11:17:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:43.409790 543705 memory.go:191] Add success.
I0319 11:17:43.409810 543705 cpu.go:282] Add success.
I0319 11:17:43.419883 543705 net.go:648] Add success.
I0319 11:17:43.422645 543705 net.go:770] primary dev: ETH0
I0319 11:17:43.422660 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:17:43.422674 543705 net.go:698] Add success.
I0319 11:17:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:17:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:17:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:17:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:17:53.409786 543705 memory.go:184] no items to output this cycle
I0319 11:17:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 11:18:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:03.409782 543705 memory.go:184] no items to output this cycle
I0319 11:18:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 11:18:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:13.409789 543705 memory.go:191] Add success.
I0319 11:18:13.409807 543705 cpu.go:282] Add success.
W0319 11:18:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:18:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:18:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:18:13.420355 543705 net.go:648] Add success.
I0319 11:18:13.423370 543705 net.go:770] primary dev: ETH0
I0319 11:18:13.423383 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:18:13.423394 543705 net.go:698] Add success.
I0319 11:18:13.469490 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"abbd50c3-cdc1-4ebb-b699-df662eea4c0d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:18:13.469524 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:18:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:18:14.455304 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:18:14.455367 543705 disk_worker.go:708] disk space is not compliant
W0319 11:18:14.455370 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:18:14.457552 543705 disk_worker.go:494] system disk:vda1
I0319 11:18:14.457580 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:18:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:18:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:18:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:18:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:18:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:18:18.849674 543705 disk_info.go:125] begin check local disk info of client
I0319 11:18:18.852046 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:18:18.852052 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004dd940 0xc0004dd980]
E0319 11:18:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:23.409793 543705 memory.go:184] no items to output this cycle
I0319 11:18:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 11:18:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:33.409785 543705 memory.go:184] no items to output this cycle
I0319 11:18:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 11:18:37.692289 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:18:37.692296 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:18:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:43.410755 543705 memory.go:191] Add success.
I0319 11:18:43.409806 543705 cpu.go:282] Add success.
I0319 11:18:43.420430 543705 net.go:648] Add success.
I0319 11:18:43.423180 543705 net.go:770] primary dev: ETH0
I0319 11:18:43.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:18:43.423206 543705 net.go:698] Add success.
I0319 11:18:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:18:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:18:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:18:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:18:53.409781 543705 memory.go:184] no items to output this cycle
I0319 11:18:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:19:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:03.409814 543705 memory.go:184] no items to output this cycle
I0319 11:19:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 11:19:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:13.409803 543705 memory.go:191] Add success.
I0319 11:19:13.409807 543705 cpu.go:282] Add success.
W0319 11:19:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:19:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:19:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:19:13.420061 543705 net.go:648] Add success.
I0319 11:19:13.422910 543705 net.go:770] primary dev: ETH0
I0319 11:19:13.422927 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:19:13.422942 543705 net.go:698] Add success.
I0319 11:19:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:19:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:19:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 11:19:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:19:14.456557 543705 disk_worker.go:494] system disk:vda1
I0319 11:19:14.456723 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:19:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:19:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:19:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:19:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:19:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:19:18.853672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:19:18.856023 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:19:18.856029 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484600 0xc000484640]
E0319 11:19:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:23.409784 543705 memory.go:184] no items to output this cycle
I0319 11:19:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 11:19:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:33.409796 543705 memory.go:184] no items to output this cycle
I0319 11:19:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:19:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:43.409799 543705 memory.go:191] Add success.
I0319 11:19:43.409820 543705 cpu.go:282] Add success.
I0319 11:19:43.419958 543705 net.go:648] Add success.
I0319 11:19:43.422831 543705 net.go:770] primary dev: ETH0
I0319 11:19:43.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:19:43.422856 543705 net.go:698] Add success.
I0319 11:19:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:19:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:19:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:19:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:19:53.409810 543705 memory.go:184] no items to output this cycle
I0319 11:19:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 11:20:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:03.409788 543705 memory.go:184] no items to output this cycle
I0319 11:20:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:20:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:13.409783 543705 memory.go:191] Add success.
I0319 11:20:13.409809 543705 cpu.go:282] Add success.
W0319 11:20:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:20:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:20:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:20:13.420060 543705 net.go:648] Add success.
I0319 11:20:13.423051 543705 net.go:770] primary dev: ETH0
I0319 11:20:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:20:13.423081 543705 net.go:698] Add success.
I0319 11:20:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:20:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:20:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 11:20:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:20:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 11:20:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:20:15.456022 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:20:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:20:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:20:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:20:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:20:18.857671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:20:18.860005 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:20:18.860012 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003dc340 0xc0003dc380]
E0319 11:20:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:23.409765 543705 memory.go:184] no items to output this cycle
I0319 11:20:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 11:20:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:33.409771 543705 memory.go:184] no items to output this cycle
I0319 11:20:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:20:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:43.409815 543705 memory.go:191] Add success.
I0319 11:20:43.409825 543705 cpu.go:282] Add success.
I0319 11:20:43.419966 543705 net.go:648] Add success.
I0319 11:20:43.422961 543705 net.go:770] primary dev: ETH0
I0319 11:20:43.422974 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:20:43.422986 543705 net.go:698] Add success.
I0319 11:20:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:20:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:20:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:20:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:20:53.409781 543705 memory.go:184] no items to output this cycle
I0319 11:20:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 11:21:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:03.409783 543705 memory.go:184] no items to output this cycle
I0319 11:21:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:21:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:13.409783 543705 memory.go:191] Add success.
I0319 11:21:13.409805 543705 cpu.go:282] Add success.
W0319 11:21:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:21:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:21:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:21:13.420052 543705 net.go:648] Add success.
I0319 11:21:13.422746 543705 net.go:770] primary dev: ETH0
I0319 11:21:13.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:21:13.422776 543705 net.go:698] Add success.
I0319 11:21:13.471296 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"52f862d9-306e-45bb-85ce-e71796e961cf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:21:13.471329 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:21:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:21:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:21:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 11:21:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:21:14.456575 543705 disk_worker.go:494] system disk:vda1
I0319 11:21:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:21:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:21:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:21:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:21:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:21:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:21:18.861672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:21:18.864043 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:21:18.864049 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492ec0 0xc000492f00]
E0319 11:21:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:23.409773 543705 cpu.go:275] no items to output this cycle
I0319 11:21:23.409780 543705 memory.go:184] no items to output this cycle
E0319 11:21:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:33.409805 543705 memory.go:184] no items to output this cycle
I0319 11:21:33.409818 543705 cpu.go:275] no items to output this cycle
I0319 11:21:37.692431 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:21:37.692438 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:21:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:43.410627 543705 memory.go:191] Add success.
I0319 11:21:43.409809 543705 cpu.go:282] Add success.
I0319 11:21:43.420333 543705 net.go:648] Add success.
I0319 11:21:43.422910 543705 net.go:770] primary dev: ETH0
I0319 11:21:43.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:21:43.422949 543705 net.go:698] Add success.
I0319 11:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:21:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:21:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:21:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:21:53.409793 543705 memory.go:184] no items to output this cycle
I0319 11:21:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:22:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:03.409791 543705 memory.go:184] no items to output this cycle
I0319 11:22:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:22:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:13.409786 543705 memory.go:191] Add success.
I0319 11:22:13.409786 543705 cpu.go:282] Add success.
W0319 11:22:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:22:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:22:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:22:13.420152 543705 net.go:648] Add success.
I0319 11:22:13.423181 543705 net.go:770] primary dev: ETH0
I0319 11:22:13.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:22:13.423205 543705 net.go:698] Add success.
W0319 11:22:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:22:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 11:22:14.455193 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:22:14.456807 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:22:14.456816 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:22:14.456823 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:22:14.456926 543705 disk_worker.go:494] system disk:vda1
I0319 11:22:14.456955 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:22:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:22:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:22:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:22:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:22:16.458012 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:22:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:22:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:22:18.865665 543705 disk_info.go:125] begin check local disk info of client
I0319 11:22:18.867952 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:22:18.867958 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509ac0 0xc000509b00]
E0319 11:22:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:23.409837 543705 memory.go:184] no items to output this cycle
I0319 11:22:23.409950 543705 cpu.go:275] no items to output this cycle
E0319 11:22:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:33.409811 543705 memory.go:184] no items to output this cycle
I0319 11:22:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 11:22:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:43.409781 543705 memory.go:191] Add success.
I0319 11:22:43.409806 543705 cpu.go:282] Add success.
I0319 11:22:43.419877 543705 net.go:648] Add success.
I0319 11:22:43.422490 543705 net.go:770] primary dev: ETH0
I0319 11:22:43.422505 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:22:43.422518 543705 net.go:698] Add success.
I0319 11:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:22:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:22:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:22:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:22:53.409775 543705 memory.go:184] no items to output this cycle
I0319 11:22:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 11:23:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:03.409782 543705 memory.go:184] no items to output this cycle
I0319 11:23:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:23:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:13.409817 543705 memory.go:191] Add success.
I0319 11:23:13.409824 543705 cpu.go:282] Add success.
W0319 11:23:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:23:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:23:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:23:13.420163 543705 net.go:648] Add success.
I0319 11:23:13.423121 543705 net.go:770] primary dev: ETH0
I0319 11:23:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:23:13.423145 543705 net.go:698] Add success.
I0319 11:23:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:23:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:23:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0319 11:23:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:23:14.456489 543705 disk_worker.go:494] system disk:vda1
I0319 11:23:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:23:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:23:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:23:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:23:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:23:16.472413 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:23:18.869675 543705 disk_info.go:125] begin check local disk info of client
I0319 11:23:18.872028 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:23:18.872034 543705 disk_info.go:196] parse disk info done, disk is : [0xc000345800 0xc000345840]
E0319 11:23:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:23.409771 543705 cpu.go:275] no items to output this cycle
I0319 11:23:23.409773 543705 memory.go:184] no items to output this cycle
E0319 11:23:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:33.409796 543705 memory.go:184] no items to output this cycle
I0319 11:23:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:23:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:43.409787 543705 memory.go:191] Add success.
I0319 11:23:43.409805 543705 cpu.go:282] Add success.
I0319 11:23:43.420012 543705 net.go:648] Add success.
I0319 11:23:43.422846 543705 net.go:770] primary dev: ETH0
I0319 11:23:43.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:23:43.422872 543705 net.go:698] Add success.
I0319 11:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:23:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:23:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:23:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:23:53.409777 543705 memory.go:184] no items to output this cycle
I0319 11:23:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 11:24:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:03.409786 543705 cpu.go:275] no items to output this cycle
I0319 11:24:03.409792 543705 memory.go:184] no items to output this cycle
E0319 11:24:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:13.409809 543705 memory.go:191] Add success.
I0319 11:24:13.409817 543705 cpu.go:282] Add success.
W0319 11:24:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:24:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:24:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:24:13.420230 543705 net.go:648] Add success.
I0319 11:24:13.423404 543705 net.go:770] primary dev: ETH0
I0319 11:24:13.423418 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:24:13.423430 543705 net.go:698] Add success.
I0319 11:24:13.469807 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5d0ef736-934e-4a85-ac1d-59401a082d9f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:24:13.469865 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:24:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:24:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:24:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 11:24:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:24:14.456750 543705 disk_worker.go:494] system disk:vda1
I0319 11:24:14.456778 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:24:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:24:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:24:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:24:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:24:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:24:18.873672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:24:18.876016 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:24:18.876022 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039cd80 0xc00039cdc0]
E0319 11:24:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:23.409795 543705 memory.go:184] no items to output this cycle
I0319 11:24:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 11:24:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:33.409813 543705 memory.go:184] no items to output this cycle
I0319 11:24:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 11:24:37.692583 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:24:37.692590 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:24:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:43.410714 543705 memory.go:191] Add success.
I0319 11:24:43.409818 543705 cpu.go:282] Add success.
I0319 11:24:43.420451 543705 net.go:648] Add success.
I0319 11:24:43.423548 543705 net.go:770] primary dev: ETH0
I0319 11:24:43.423563 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:24:43.423578 543705 net.go:698] Add success.
I0319 11:24:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:24:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:24:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:24:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:24:53.409776 543705 memory.go:184] no items to output this cycle
I0319 11:24:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 11:25:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:03.409803 543705 memory.go:184] no items to output this cycle
I0319 11:25:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 11:25:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:13.409793 543705 memory.go:191] Add success.
I0319 11:25:13.409817 543705 cpu.go:282] Add success.
W0319 11:25:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:25:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:25:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:25:13.419965 543705 net.go:770] primary dev: ETH0
I0319 11:25:13.419979 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:25:13.419991 543705 net.go:698] Add success.
I0319 11:25:13.420221 543705 net.go:648] Add success.
I0319 11:25:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:25:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:25:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 11:25:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:25:14.456475 543705 disk_worker.go:494] system disk:vda1
I0319 11:25:14.456519 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:25:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:25:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:25:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:25:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:25:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:25:18.877668 543705 disk_info.go:125] begin check local disk info of client
I0319 11:25:18.880041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:25:18.880046 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396780 0xc0003967c0]
E0319 11:25:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:23.409776 543705 memory.go:184] no items to output this cycle
I0319 11:25:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:25:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:33.409810 543705 memory.go:184] no items to output this cycle
I0319 11:25:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 11:25:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:43.409823 543705 memory.go:191] Add success.
I0319 11:25:43.409830 543705 cpu.go:282] Add success.
I0319 11:25:43.419899 543705 net.go:648] Add success.
I0319 11:25:43.422676 543705 net.go:770] primary dev: ETH0
I0319 11:25:43.422690 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:25:43.422701 543705 net.go:698] Add success.
I0319 11:25:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:25:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:25:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:25:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:25:53.409807 543705 memory.go:184] no items to output this cycle
I0319 11:25:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 11:26:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:03.409790 543705 memory.go:184] no items to output this cycle
I0319 11:26:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:26:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:13.409797 543705 memory.go:191] Add success.
I0319 11:26:13.409823 543705 cpu.go:282] Add success.
W0319 11:26:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:26:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:26:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:26:13.420246 543705 net.go:648] Add success.
I0319 11:26:13.423282 543705 net.go:770] primary dev: ETH0
I0319 11:26:13.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:26:13.423309 543705 net.go:698] Add success.
I0319 11:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:26:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:26:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 11:26:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:26:14.456564 543705 disk_worker.go:494] system disk:vda1
I0319 11:26:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:26:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:26:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:26:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:26:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:26:18.881669 543705 disk_info.go:125] begin check local disk info of client
I0319 11:26:18.884018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:26:18.884024 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f2080 0xc0001f20c0]
E0319 11:26:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:23.409796 543705 memory.go:184] no items to output this cycle
I0319 11:26:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:26:33.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:33.409821 543705 memory.go:184] no items to output this cycle
I0319 11:26:33.409829 543705 cpu.go:275] no items to output this cycle
E0319 11:26:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:43.409789 543705 memory.go:191] Add success.
I0319 11:26:43.409793 543705 cpu.go:282] Add success.
I0319 11:26:43.420303 543705 net.go:648] Add success.
I0319 11:26:43.423053 543705 net.go:770] primary dev: ETH0
I0319 11:26:43.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:26:43.423077 543705 net.go:698] Add success.
I0319 11:26:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:26:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:26:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:26:53.409800 543705 memory.go:184] no items to output this cycle
I0319 11:26:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:27:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:03.409804 543705 memory.go:184] no items to output this cycle
I0319 11:27:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 11:27:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:13.409782 543705 memory.go:191] Add success.
W0319 11:27:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:27:13.409806 543705 cpu.go:282] Add success.
W0319 11:27:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:27:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:27:13.420110 543705 net.go:648] Add success.
I0319 11:27:13.428797 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 11:27:13.428882 543705 net.go:770] primary dev: ETH0
I0319 11:27:13.428894 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:27:13.428904 543705 net.go:698] Add success.
I0319 11:27:13.453417 543705 event_worker.go:152] Polling the log file for events...
I0319 11:27:13.468645 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fe6d4fdc-b043-40be-a5bc-f5bb1f2271d8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:27:13.468677 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 11:27:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:27:14.455254 543705 disk_worker.go:708] disk space is not compliant
W0319 11:27:14.455259 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:27:14.456100 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:27:14.456110 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:27:14.456116 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:27:14.456933 543705 disk_worker.go:494] system disk:vda1
I0319 11:27:14.456964 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:27:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:27:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:27:16.457913 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:27:16.457913 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:27:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:27:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:27:16.472289 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:27:18.885671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:27:18.888067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:27:18.888073 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002aa440 0xc0002aa480]
E0319 11:27:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:23.409781 543705 memory.go:184] no items to output this cycle
I0319 11:27:23.409781 543705 cpu.go:275] no items to output this cycle
E0319 11:27:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:33.409771 543705 memory.go:184] no items to output this cycle
I0319 11:27:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 11:27:37.693159 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:27:37.693165 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:27:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:43.410769 543705 memory.go:191] Add success.
I0319 11:27:43.409821 543705 cpu.go:282] Add success.
I0319 11:27:43.420749 543705 net.go:648] Add success.
I0319 11:27:43.423426 543705 net.go:770] primary dev: ETH0
I0319 11:27:43.423439 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:27:43.423451 543705 net.go:698] Add success.
I0319 11:27:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:27:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:27:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:27:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:27:53.409803 543705 memory.go:184] no items to output this cycle
I0319 11:27:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 11:28:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:03.409802 543705 memory.go:184] no items to output this cycle
I0319 11:28:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 11:28:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:13.409778 543705 memory.go:191] Add success.
W0319 11:28:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:28:13.409807 543705 cpu.go:282] Add success.
W0319 11:28:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:28:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:28:13.420099 543705 net.go:648] Add success.
I0319 11:28:13.422987 543705 net.go:770] primary dev: ETH0
I0319 11:28:13.422999 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:28:13.423012 543705 net.go:698] Add success.
I0319 11:28:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:28:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:28:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 11:28:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:28:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 11:28:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:28:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:28:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:28:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:28:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:28:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:28:18.889674 543705 disk_info.go:125] begin check local disk info of client
I0319 11:28:18.892008 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:28:18.892014 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaf00 0xc0001aaf40]
E0319 11:28:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:23.409789 543705 memory.go:184] no items to output this cycle
I0319 11:28:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:28:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:33.409789 543705 cpu.go:275] no items to output this cycle
I0319 11:28:33.409793 543705 memory.go:184] no items to output this cycle
E0319 11:28:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:43.409810 543705 memory.go:191] Add success.
I0319 11:28:43.409819 543705 cpu.go:282] Add success.
I0319 11:28:43.420039 543705 net.go:648] Add success.
I0319 11:28:43.422833 543705 net.go:770] primary dev: ETH0
I0319 11:28:43.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:28:43.422858 543705 net.go:698] Add success.
I0319 11:28:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:28:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:28:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:28:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:28:53.409787 543705 memory.go:184] no items to output this cycle
I0319 11:28:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 11:29:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:03.409785 543705 memory.go:184] no items to output this cycle
I0319 11:29:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:29:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:13.409806 543705 memory.go:191] Add success.
I0319 11:29:13.409820 543705 cpu.go:282] Add success.
W0319 11:29:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:29:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:29:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:29:13.420314 543705 net.go:648] Add success.
I0319 11:29:13.422853 543705 net.go:770] primary dev: ETH0
I0319 11:29:13.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:29:13.422879 543705 net.go:698] Add success.
I0319 11:29:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:29:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:29:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 11:29:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:29:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 11:29:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:29:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:29:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:29:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:29:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:29:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:29:18.893670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:29:18.896064 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:29:18.896071 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4400 0xc0000c4440]
E0319 11:29:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:23.409771 543705 cpu.go:275] no items to output this cycle
I0319 11:29:23.409776 543705 memory.go:184] no items to output this cycle
E0319 11:29:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:33.409802 543705 memory.go:184] no items to output this cycle
I0319 11:29:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 11:29:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:43.409828 543705 memory.go:191] Add success.
I0319 11:29:43.409830 543705 cpu.go:282] Add success.
I0319 11:29:43.419973 543705 net.go:648] Add success.
I0319 11:29:43.423111 543705 net.go:770] primary dev: ETH0
I0319 11:29:43.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:29:43.423136 543705 net.go:698] Add success.
I0319 11:29:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:29:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:29:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:29:53.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:29:53.409909 543705 cpu.go:275] no items to output this cycle
I0319 11:29:53.409938 543705 memory.go:184] no items to output this cycle
E0319 11:30:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:03.409814 543705 memory.go:184] no items to output this cycle
I0319 11:30:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 11:30:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:13.409787 543705 memory.go:191] Add success.
I0319 11:30:13.409788 543705 cpu.go:282] Add success.
W0319 11:30:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:30:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:30:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:30:13.420331 543705 net.go:648] Add success.
I0319 11:30:13.423192 543705 net.go:770] primary dev: ETH0
I0319 11:30:13.423205 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:30:13.423218 543705 net.go:698] Add success.
I0319 11:30:13.469771 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"922b6db6-8fed-4ab9-87f9-3d27ded68eac","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:30:13.469819 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:30:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:30:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:30:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 11:30:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:30:14.456661 543705 disk_worker.go:494] system disk:vda1
I0319 11:30:14.456691 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:30:15.455604 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:30:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:30:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:30:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:30:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:30:18.897670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:30:18.900118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:30:18.900125 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bc80 0xc00007bcc0]
E0319 11:30:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:23.409774 543705 cpu.go:275] no items to output this cycle
I0319 11:30:23.409784 543705 memory.go:184] no items to output this cycle
E0319 11:30:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 11:30:33.409794 543705 memory.go:184] no items to output this cycle
I0319 11:30:37.693736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:30:37.693743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:30:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:43.410629 543705 memory.go:191] Add success.
I0319 11:30:43.409826 543705 cpu.go:282] Add success.
I0319 11:30:43.420323 543705 net.go:648] Add success.
I0319 11:30:43.423187 543705 net.go:770] primary dev: ETH0
I0319 11:30:43.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:30:43.423213 543705 net.go:698] Add success.
I0319 11:30:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:30:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:30:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:30:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:30:53.409777 543705 memory.go:184] no items to output this cycle
I0319 11:30:53.409890 543705 cpu.go:275] no items to output this cycle
E0319 11:31:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:03.409766 543705 memory.go:184] no items to output this cycle
I0319 11:31:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 11:31:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:13.409808 543705 memory.go:191] Add success.
I0319 11:31:13.409813 543705 cpu.go:282] Add success.
W0319 11:31:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:31:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:31:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:31:13.420067 543705 net.go:648] Add success.
I0319 11:31:13.422771 543705 net.go:770] primary dev: ETH0
I0319 11:31:13.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:31:13.422800 543705 net.go:698] Add success.
I0319 11:31:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:31:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:31:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 11:31:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:31:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 11:31:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:31:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:31:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:31:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:31:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:31:18.901681 543705 disk_info.go:125] begin check local disk info of client
I0319 11:31:18.904029 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:31:18.904035 543705 disk_info.go:196] parse disk info done, disk is : [0xc000508700 0xc000508740]
E0319 11:31:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:23.409791 543705 memory.go:184] no items to output this cycle
I0319 11:31:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:31:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:33.409781 543705 memory.go:184] no items to output this cycle
I0319 11:31:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:31:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:43.409808 543705 memory.go:191] Add success.
I0319 11:31:43.409818 543705 cpu.go:282] Add success.
I0319 11:31:43.419955 543705 net.go:648] Add success.
I0319 11:31:43.422997 543705 net.go:770] primary dev: ETH0
I0319 11:31:43.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:31:43.423030 543705 net.go:698] Add success.
I0319 11:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:31:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:31:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:31:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:31:53.409784 543705 cpu.go:275] no items to output this cycle
I0319 11:31:53.409790 543705 memory.go:184] no items to output this cycle
E0319 11:32:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:03.409790 543705 memory.go:184] no items to output this cycle
I0319 11:32:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 11:32:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:13.409793 543705 memory.go:191] Add success.
I0319 11:32:13.409794 543705 cpu.go:282] Add success.
W0319 11:32:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:32:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:32:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:32:13.420211 543705 net.go:648] Add success.
I0319 11:32:13.422897 543705 net.go:770] primary dev: ETH0
I0319 11:32:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:32:13.422922 543705 net.go:698] Add success.
W0319 11:32:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:32:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 11:32:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:32:14.455901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:32:14.455910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:32:14.455916 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:32:14.456550 543705 disk_worker.go:494] system disk:vda1
I0319 11:32:14.456579 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:32:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:32:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:32:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:32:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:32:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:32:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:32:16.472349 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:32:18.905673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:32:18.907997 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:32:18.908003 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b50c0 0xc0002b5100]
E0319 11:32:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:23.409761 543705 memory.go:184] no items to output this cycle
I0319 11:32:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:32:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:33.409778 543705 memory.go:184] no items to output this cycle
I0319 11:32:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:32:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:43.409779 543705 memory.go:191] Add success.
I0319 11:32:43.409800 543705 cpu.go:282] Add success.
I0319 11:32:43.419859 543705 net.go:648] Add success.
I0319 11:32:43.422546 543705 net.go:770] primary dev: ETH0
I0319 11:32:43.422559 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:32:43.422573 543705 net.go:698] Add success.
I0319 11:32:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:32:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:32:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:32:53.410273 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:32:53.410303 543705 memory.go:184] no items to output this cycle
I0319 11:32:53.410303 543705 cpu.go:275] no items to output this cycle
E0319 11:33:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:03.409791 543705 memory.go:184] no items to output this cycle
I0319 11:33:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:33:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:13.409793 543705 memory.go:191] Add success.
I0319 11:33:13.409798 543705 cpu.go:282] Add success.
W0319 11:33:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:33:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:33:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:33:13.420200 543705 net.go:648] Add success.
I0319 11:33:13.423058 543705 net.go:770] primary dev: ETH0
I0319 11:33:13.423073 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:33:13.423087 543705 net.go:698] Add success.
I0319 11:33:13.469173 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dec46e48-f5a5-41b7-8f27-eb876005629d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:33:13.469210 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:33:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:33:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:33:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0319 11:33:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:33:14.456782 543705 disk_worker.go:494] system disk:vda1
I0319 11:33:14.456810 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:33:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:33:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:33:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:33:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:33:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:33:18.909671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:33:18.912013 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:33:18.912018 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466800 0xc000466840]
E0319 11:33:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:23.409774 543705 cpu.go:275] no items to output this cycle
I0319 11:33:23.409779 543705 memory.go:184] no items to output this cycle
E0319 11:33:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:33.409778 543705 memory.go:184] no items to output this cycle
I0319 11:33:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 11:33:37.695162 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:33:37.695169 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:33:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:43.410837 543705 memory.go:191] Add success.
I0319 11:33:43.409817 543705 cpu.go:282] Add success.
I0319 11:33:43.420539 543705 net.go:648] Add success.
I0319 11:33:43.423259 543705 net.go:770] primary dev: ETH0
I0319 11:33:43.423273 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:33:43.423455 543705 net.go:698] Add success.
I0319 11:33:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:33:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:33:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:33:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:33:53.409805 543705 memory.go:184] no items to output this cycle
I0319 11:33:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 11:34:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:03.409809 543705 memory.go:184] no items to output this cycle
I0319 11:34:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 11:34:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:13.409781 543705 memory.go:191] Add success.
W0319 11:34:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:34:13.409815 543705 cpu.go:282] Add success.
W0319 11:34:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:34:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:34:13.420036 543705 net.go:648] Add success.
I0319 11:34:13.422935 543705 net.go:770] primary dev: ETH0
I0319 11:34:13.422948 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:34:13.422961 543705 net.go:698] Add success.
I0319 11:34:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:34:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:34:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 11:34:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:34:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 11:34:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:34:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:34:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:34:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:34:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:34:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:34:18.913670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:34:18.916005 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:34:18.916012 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab000 0xc0001ab040]
E0319 11:34:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:23.409770 543705 cpu.go:275] no items to output this cycle
I0319 11:34:23.409774 543705 memory.go:184] no items to output this cycle
E0319 11:34:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:33.409786 543705 cpu.go:275] no items to output this cycle
I0319 11:34:33.409791 543705 memory.go:184] no items to output this cycle
E0319 11:34:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:43.409792 543705 memory.go:191] Add success.
I0319 11:34:43.409795 543705 cpu.go:282] Add success.
I0319 11:34:43.419838 543705 net.go:648] Add success.
I0319 11:34:43.422794 543705 net.go:770] primary dev: ETH0
I0319 11:34:43.422813 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:34:43.422828 543705 net.go:698] Add success.
I0319 11:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:34:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:34:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:34:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:34:53.409780 543705 memory.go:184] no items to output this cycle
I0319 11:34:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 11:35:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:03.409878 543705 cpu.go:275] no items to output this cycle
I0319 11:35:03.409889 543705 memory.go:184] no items to output this cycle
E0319 11:35:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:13.409808 543705 memory.go:191] Add success.
I0319 11:35:13.409826 543705 cpu.go:282] Add success.
W0319 11:35:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:35:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:35:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:35:13.420101 543705 net.go:648] Add success.
I0319 11:35:13.422638 543705 net.go:770] primary dev: ETH0
I0319 11:35:13.422650 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:35:13.422662 543705 net.go:698] Add success.
I0319 11:35:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:35:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:35:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 11:35:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:35:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 11:35:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:35:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:35:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:35:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:35:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:35:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:35:18.917670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:35:18.920085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:35:18.920092 543705 disk_info.go:196] parse disk info done, disk is : [0xc000508a00 0xc000508a40]
E0319 11:35:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:23.409792 543705 memory.go:184] no items to output this cycle
I0319 11:35:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:35:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:33.409782 543705 memory.go:184] no items to output this cycle
I0319 11:35:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:35:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:43.409808 543705 memory.go:191] Add success.
I0319 11:35:43.409818 543705 cpu.go:282] Add success.
I0319 11:35:43.419892 543705 net.go:648] Add success.
I0319 11:35:43.422540 543705 net.go:770] primary dev: ETH0
I0319 11:35:43.422553 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:35:43.422565 543705 net.go:698] Add success.
I0319 11:35:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:35:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:35:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:35:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:35:53.409773 543705 memory.go:184] no items to output this cycle
I0319 11:35:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:36:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:03.409786 543705 memory.go:184] no items to output this cycle
I0319 11:36:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:36:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:13.409782 543705 memory.go:191] Add success.
W0319 11:36:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:36:13.409813 543705 cpu.go:282] Add success.
W0319 11:36:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:36:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:36:13.420155 543705 net.go:648] Add success.
I0319 11:36:13.423099 543705 net.go:770] primary dev: ETH0
I0319 11:36:13.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:36:13.423125 543705 net.go:698] Add success.
I0319 11:36:13.743848 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"503c2a59-3b1b-4985-b470-ad7b87872355","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:36:13.743881 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:36:14.454456 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:36:14.454676 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:36:14.454687 543705 disk_worker.go:708] disk space is not compliant
W0319 11:36:14.454689 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:36:14.456024 543705 disk_worker.go:494] system disk:vda1
I0319 11:36:14.456068 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:36:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:36:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:36:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:36:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:36:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:36:18.921671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:36:18.924015 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:36:18.924021 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4f40 0xc0000c4f80]
E0319 11:36:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:23.409787 543705 memory.go:184] no items to output this cycle
I0319 11:36:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 11:36:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:33.409790 543705 memory.go:184] no items to output this cycle
I0319 11:36:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 11:36:37.696189 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:36:37.696196 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:36:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:43.410665 543705 memory.go:191] Add success.
I0319 11:36:43.409816 543705 cpu.go:282] Add success.
I0319 11:36:43.420373 543705 net.go:648] Add success.
I0319 11:36:43.423036 543705 net.go:770] primary dev: ETH0
I0319 11:36:43.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:36:43.423063 543705 net.go:698] Add success.
I0319 11:36:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:36:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:36:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:36:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:36:53.409767 543705 memory.go:184] no items to output this cycle
I0319 11:36:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:37:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:03.409786 543705 memory.go:184] no items to output this cycle
I0319 11:37:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 11:37:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:13.409779 543705 memory.go:191] Add success.
W0319 11:37:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:37:13.409812 543705 cpu.go:282] Add success.
W0319 11:37:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:37:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:37:13.420212 543705 net.go:648] Add success.
I0319 11:37:13.423175 543705 net.go:770] primary dev: ETH0
I0319 11:37:13.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:37:13.423200 543705 net.go:698] Add success.
I0319 11:37:13.453738 543705 event_worker.go:152] Polling the log file for events...
W0319 11:37:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:37:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 11:37:14.455162 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:37:14.456907 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:37:14.456917 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:37:14.456922 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:37:14.456992 543705 disk_worker.go:494] system disk:vda1
I0319 11:37:14.457019 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:37:15.456806 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:37:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:37:16.457961 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:37:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:37:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:37:16.458036 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:37:16.472355 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:37:18.925670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:37:18.928021 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:37:18.928027 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee300 0xc0003ee340]
E0319 11:37:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:23.409759 543705 memory.go:184] no items to output this cycle
I0319 11:37:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:37:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:33.409805 543705 memory.go:184] no items to output this cycle
I0319 11:37:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 11:37:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:43.409775 543705 memory.go:191] Add success.
I0319 11:37:43.409807 543705 cpu.go:282] Add success.
I0319 11:37:43.419928 543705 net.go:648] Add success.
I0319 11:37:43.422673 543705 net.go:770] primary dev: ETH0
I0319 11:37:43.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:37:43.422702 543705 net.go:698] Add success.
I0319 11:37:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:37:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:37:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:37:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:37:53.409775 543705 memory.go:184] no items to output this cycle
I0319 11:37:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 11:38:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:03.409784 543705 memory.go:184] no items to output this cycle
I0319 11:38:03.409918 543705 cpu.go:275] no items to output this cycle
E0319 11:38:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:13.409796 543705 memory.go:191] Add success.
I0319 11:38:13.409810 543705 cpu.go:282] Add success.
W0319 11:38:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:38:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:38:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:38:13.420209 543705 net.go:648] Add success.
I0319 11:38:13.423042 543705 net.go:770] primary dev: ETH0
I0319 11:38:13.423057 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:38:13.423071 543705 net.go:698] Add success.
I0319 11:38:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:38:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:38:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 11:38:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:38:14.456516 543705 disk_worker.go:494] system disk:vda1
I0319 11:38:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:38:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:38:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:38:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:38:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:38:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:38:18.929672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:38:18.932064 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:38:18.932070 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ee080 0xc0001ee0c0]
E0319 11:38:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:23.409811 543705 memory.go:184] no items to output this cycle
I0319 11:38:23.409825 543705 cpu.go:275] no items to output this cycle
E0319 11:38:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 11:38:33.409803 543705 memory.go:184] no items to output this cycle
E0319 11:38:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:43.409800 543705 memory.go:191] Add success.
I0319 11:38:43.409810 543705 cpu.go:282] Add success.
I0319 11:38:43.420057 543705 net.go:648] Add success.
I0319 11:38:43.422896 543705 net.go:770] primary dev: ETH0
I0319 11:38:43.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:38:43.422926 543705 net.go:698] Add success.
I0319 11:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:38:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:38:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:38:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:38:53.409769 543705 memory.go:184] no items to output this cycle
I0319 11:38:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:39:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:03.409784 543705 memory.go:184] no items to output this cycle
I0319 11:39:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 11:39:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:13.409800 543705 memory.go:191] Add success.
I0319 11:39:13.409801 543705 cpu.go:282] Add success.
W0319 11:39:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:39:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:39:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:39:13.420167 543705 net.go:648] Add success.
I0319 11:39:13.423613 543705 net.go:770] primary dev: ETH0
I0319 11:39:13.423628 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:39:13.423640 543705 net.go:698] Add success.
I0319 11:39:13.469614 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"580517cf-1484-4df5-ac68-26b120d099dc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:39:13.469658 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:39:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:39:14.455090 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:39:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0319 11:39:14.455155 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:39:14.456512 543705 disk_worker.go:494] system disk:vda1
I0319 11:39:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:39:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:39:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:39:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:39:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:39:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:39:18.933670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:39:18.936039 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:39:18.936045 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f4600 0xc0004f4640]
E0319 11:39:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:23.409794 543705 memory.go:184] no items to output this cycle
I0319 11:39:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 11:39:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:33.409812 543705 memory.go:184] no items to output this cycle
I0319 11:39:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 11:39:37.696335 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:39:37.696342 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:39:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:43.410665 543705 memory.go:191] Add success.
I0319 11:39:43.409806 543705 cpu.go:282] Add success.
I0319 11:39:43.420387 543705 net.go:648] Add success.
I0319 11:39:43.422893 543705 net.go:770] primary dev: ETH0
I0319 11:39:43.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:39:43.422917 543705 net.go:698] Add success.
I0319 11:39:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:39:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:39:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:39:53.410348 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:39:53.410366 543705 memory.go:184] no items to output this cycle
I0319 11:39:53.410375 543705 cpu.go:275] no items to output this cycle
E0319 11:40:03.410587 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:03.410624 543705 memory.go:184] no items to output this cycle
I0319 11:40:03.410636 543705 cpu.go:275] no items to output this cycle
E0319 11:40:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:13.409791 543705 memory.go:191] Add success.
I0319 11:40:13.409794 543705 cpu.go:282] Add success.
W0319 11:40:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:40:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:40:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:40:13.420285 543705 net.go:648] Add success.
I0319 11:40:13.423379 543705 net.go:770] primary dev: ETH0
I0319 11:40:13.423393 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:40:13.423407 543705 net.go:698] Add success.
I0319 11:40:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:40:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:40:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 11:40:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:40:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 11:40:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:40:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:40:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:40:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:40:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:40:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:40:18.937670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:40:18.940008 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:40:18.940014 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003412c0 0xc000341300]
E0319 11:40:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:23.409797 543705 memory.go:184] no items to output this cycle
I0319 11:40:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:40:33.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:33.409820 543705 memory.go:184] no items to output this cycle
I0319 11:40:33.409831 543705 cpu.go:275] no items to output this cycle
E0319 11:40:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:43.409788 543705 memory.go:191] Add success.
I0319 11:40:43.409817 543705 cpu.go:282] Add success.
I0319 11:40:43.419885 543705 net.go:648] Add success.
I0319 11:40:43.422511 543705 net.go:770] primary dev: ETH0
I0319 11:40:43.422525 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:40:43.422537 543705 net.go:698] Add success.
I0319 11:40:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:40:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:40:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:40:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:40:53.409782 543705 memory.go:184] no items to output this cycle
I0319 11:40:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 11:41:03.409911 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:03.409936 543705 memory.go:184] no items to output this cycle
I0319 11:41:03.409945 543705 cpu.go:275] no items to output this cycle
E0319 11:41:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:13.409822 543705 memory.go:191] Add success.
I0319 11:41:13.409830 543705 cpu.go:282] Add success.
W0319 11:41:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:41:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:41:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:41:13.420172 543705 net.go:648] Add success.
I0319 11:41:13.423043 543705 net.go:770] primary dev: ETH0
I0319 11:41:13.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:41:13.423069 543705 net.go:698] Add success.
I0319 11:41:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:41:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:41:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0319 11:41:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:41:14.456473 543705 disk_worker.go:494] system disk:vda1
I0319 11:41:14.456518 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:41:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:41:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:41:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:41:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:41:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:41:18.941671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:41:18.944054 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:41:18.944060 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001eebc0 0xc0001eec00]
E0319 11:41:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:23.409797 543705 memory.go:184] no items to output this cycle
I0319 11:41:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 11:41:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 11:41:33.409804 543705 memory.go:184] no items to output this cycle
E0319 11:41:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:43.409805 543705 memory.go:191] Add success.
I0319 11:41:43.409811 543705 cpu.go:282] Add success.
I0319 11:41:43.419988 543705 net.go:648] Add success.
I0319 11:41:43.422845 543705 net.go:770] primary dev: ETH0
I0319 11:41:43.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:41:43.422871 543705 net.go:698] Add success.
I0319 11:41:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:41:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:41:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:41:53.410242 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:41:53.410258 543705 memory.go:184] no items to output this cycle
I0319 11:41:53.410289 543705 cpu.go:275] no items to output this cycle
E0319 11:42:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:03.409812 543705 memory.go:184] no items to output this cycle
I0319 11:42:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 11:42:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:13.409795 543705 memory.go:191] Add success.
I0319 11:42:13.409814 543705 cpu.go:282] Add success.
W0319 11:42:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:42:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:42:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:42:13.420106 543705 net.go:648] Add success.
I0319 11:42:13.423059 543705 net.go:770] primary dev: ETH0
I0319 11:42:13.423072 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:42:13.423084 543705 net.go:698] Add success.
I0319 11:42:13.464193 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"96e52ebe-bc85-4b32-a9b9-d0253d7c3f0b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:42:13.464227 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 11:42:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:42:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 11:42:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:42:14.456854 543705 disk_worker.go:494] system disk:vda1
E0319 11:42:14.456865 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:42:14.456883 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:42:14.456888 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:42:14.456892 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:42:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:42:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:42:16.457908 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:42:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:42:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:42:16.457982 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:42:16.472309 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:42:18.945673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:42:18.948007 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:42:18.948012 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee980 0xc0003ee9c0]
E0319 11:42:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:23.409769 543705 memory.go:184] no items to output this cycle
I0319 11:42:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 11:42:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:33.409789 543705 memory.go:184] no items to output this cycle
I0319 11:42:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 11:42:37.697177 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:42:37.697184 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:42:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:43.410622 543705 memory.go:191] Add success.
I0319 11:42:43.409793 543705 cpu.go:282] Add success.
I0319 11:42:43.420324 543705 net.go:648] Add success.
I0319 11:42:43.422953 543705 net.go:770] primary dev: ETH0
I0319 11:42:43.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:42:43.422978 543705 net.go:698] Add success.
I0319 11:42:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:42:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:42:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:42:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:42:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 11:42:53.409795 543705 memory.go:184] no items to output this cycle
E0319 11:43:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:03.409780 543705 memory.go:184] no items to output this cycle
I0319 11:43:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 11:43:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:13.409799 543705 memory.go:191] Add success.
I0319 11:43:13.409815 543705 cpu.go:282] Add success.
W0319 11:43:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:43:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:43:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:43:13.420109 543705 net.go:648] Add success.
I0319 11:43:13.422949 543705 net.go:770] primary dev: ETH0
I0319 11:43:13.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:43:13.422979 543705 net.go:698] Add success.
I0319 11:43:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:43:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:43:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 11:43:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:43:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 11:43:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:43:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:43:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:43:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:43:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:43:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:43:18.949671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:43:18.952023 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:43:18.952029 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee400 0xc0003ee440]
E0319 11:43:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:23.409789 543705 memory.go:184] no items to output this cycle
I0319 11:43:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:43:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 11:43:33.409792 543705 memory.go:184] no items to output this cycle
E0319 11:43:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:43.409797 543705 memory.go:191] Add success.
I0319 11:43:43.409799 543705 cpu.go:282] Add success.
I0319 11:43:43.419874 543705 net.go:648] Add success.
I0319 11:43:43.422948 543705 net.go:770] primary dev: ETH0
I0319 11:43:43.422963 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:43:43.422975 543705 net.go:698] Add success.
I0319 11:43:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:43:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:43:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:43:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:43:53.409789 543705 memory.go:184] no items to output this cycle
I0319 11:43:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 11:44:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:03.409791 543705 memory.go:184] no items to output this cycle
I0319 11:44:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:44:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:13.409812 543705 memory.go:191] Add success.
I0319 11:44:13.409820 543705 cpu.go:282] Add success.
W0319 11:44:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:44:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:44:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:44:13.420491 543705 net.go:648] Add success.
I0319 11:44:13.423395 543705 net.go:770] primary dev: ETH0
I0319 11:44:13.423407 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:44:13.423419 543705 net.go:698] Add success.
I0319 11:44:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:44:14.455081 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:44:14.455143 543705 disk_worker.go:708] disk space is not compliant
W0319 11:44:14.455146 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:44:14.456480 543705 disk_worker.go:494] system disk:vda1
I0319 11:44:14.456523 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:44:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:44:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:44:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:44:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:44:16.472353 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:44:18.953672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:44:18.956135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:44:18.956141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2500 0xc0003b2540]
E0319 11:44:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:23.409767 543705 memory.go:184] no items to output this cycle
I0319 11:44:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:44:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:33.409779 543705 memory.go:184] no items to output this cycle
I0319 11:44:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 11:44:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:43.409802 543705 memory.go:191] Add success.
I0319 11:44:43.409803 543705 cpu.go:282] Add success.
I0319 11:44:43.419882 543705 net.go:648] Add success.
I0319 11:44:43.422870 543705 net.go:770] primary dev: ETH0
I0319 11:44:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:44:43.422896 543705 net.go:698] Add success.
I0319 11:44:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:44:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:44:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:44:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:44:53.409793 543705 memory.go:184] no items to output this cycle
I0319 11:44:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:45:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:03.409777 543705 memory.go:184] no items to output this cycle
I0319 11:45:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:45:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:13.409801 543705 memory.go:191] Add success.
I0319 11:45:13.409802 543705 cpu.go:282] Add success.
W0319 11:45:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:45:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:45:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:45:13.420166 543705 net.go:648] Add success.
I0319 11:45:13.422958 543705 net.go:770] primary dev: ETH0
I0319 11:45:13.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:45:13.422984 543705 net.go:698] Add success.
I0319 11:45:13.469144 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"796e055a-fd08-4dea-9b49-57d6f22f41c7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:45:13.469182 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:45:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:45:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:45:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 11:45:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:45:14.456514 543705 disk_worker.go:494] system disk:vda1
I0319 11:45:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:45:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:45:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:45:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:45:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:45:18.957671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:45:18.960094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:45:18.960099 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ee600 0xc0001ee640]
E0319 11:45:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:23.409775 543705 memory.go:184] no items to output this cycle
I0319 11:45:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 11:45:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:33.409775 543705 memory.go:184] no items to output this cycle
I0319 11:45:33.409790 543705 cpu.go:275] no items to output this cycle
I0319 11:45:37.697736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:45:37.697744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:45:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:43.410736 543705 memory.go:191] Add success.
I0319 11:45:43.409803 543705 cpu.go:282] Add success.
I0319 11:45:43.420498 543705 net.go:648] Add success.
I0319 11:45:43.423176 543705 net.go:770] primary dev: ETH0
I0319 11:45:43.423189 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:45:43.423203 543705 net.go:698] Add success.
I0319 11:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:45:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:45:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:45:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:45:53.409773 543705 memory.go:184] no items to output this cycle
I0319 11:45:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 11:46:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:03.409789 543705 memory.go:184] no items to output this cycle
I0319 11:46:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 11:46:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:13.409799 543705 memory.go:191] Add success.
I0319 11:46:13.409802 543705 cpu.go:282] Add success.
W0319 11:46:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:46:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:46:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:46:13.420363 543705 net.go:648] Add success.
I0319 11:46:13.422956 543705 net.go:770] primary dev: ETH0
I0319 11:46:13.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:46:13.422981 543705 net.go:698] Add success.
I0319 11:46:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:46:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:46:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 11:46:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:46:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 11:46:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:46:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:46:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:46:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:46:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:46:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:46:18.961673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:46:18.964029 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:46:18.964034 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344200 0xc000344240]
E0319 11:46:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:23.409789 543705 memory.go:184] no items to output this cycle
I0319 11:46:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:46:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:33.409806 543705 memory.go:184] no items to output this cycle
I0319 11:46:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 11:46:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:43.409797 543705 memory.go:191] Add success.
I0319 11:46:43.409798 543705 cpu.go:282] Add success.
I0319 11:46:43.419966 543705 net.go:648] Add success.
I0319 11:46:43.422705 543705 net.go:770] primary dev: ETH0
I0319 11:46:43.422718 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:46:43.422731 543705 net.go:698] Add success.
I0319 11:46:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:46:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:46:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:46:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:46:53.409799 543705 memory.go:184] no items to output this cycle
I0319 11:46:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 11:47:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:03.409790 543705 cpu.go:275] no items to output this cycle
I0319 11:47:03.409792 543705 memory.go:184] no items to output this cycle
E0319 11:47:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:13.409813 543705 memory.go:191] Add success.
I0319 11:47:13.409821 543705 cpu.go:282] Add success.
W0319 11:47:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:47:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:47:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:47:13.420128 543705 net.go:648] Add success.
I0319 11:47:13.423092 543705 net.go:770] primary dev: ETH0
I0319 11:47:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:47:13.423121 543705 net.go:698] Add success.
I0319 11:47:13.453171 543705 event_worker.go:152] Polling the log file for events...
W0319 11:47:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:47:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 11:47:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:47:14.456926 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:47:14.456934 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:47:14.456940 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:47:14.456991 543705 disk_worker.go:494] system disk:vda1
I0319 11:47:14.457033 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:47:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:47:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:47:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:47:16.457964 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:47:16.458021 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:47:16.458040 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:47:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:47:18.965674 543705 disk_info.go:125] begin check local disk info of client
I0319 11:47:18.967978 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:47:18.967983 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004da2c0 0xc0004da300]
E0319 11:47:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:23.409758 543705 memory.go:184] no items to output this cycle
I0319 11:47:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 11:47:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:33.409807 543705 memory.go:184] no items to output this cycle
I0319 11:47:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 11:47:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:43.409786 543705 memory.go:191] Add success.
I0319 11:47:43.409804 543705 cpu.go:282] Add success.
I0319 11:47:43.420002 543705 net.go:648] Add success.
I0319 11:47:43.422591 543705 net.go:770] primary dev: ETH0
I0319 11:47:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:47:43.422618 543705 net.go:698] Add success.
I0319 11:47:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:47:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:47:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:47:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:47:53.409780 543705 memory.go:184] no items to output this cycle
I0319 11:47:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 11:48:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:03.409816 543705 memory.go:184] no items to output this cycle
I0319 11:48:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 11:48:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:13.409816 543705 memory.go:191] Add success.
I0319 11:48:13.409817 543705 cpu.go:282] Add success.
W0319 11:48:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:48:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:48:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:48:13.420040 543705 net.go:770] primary dev: ETH0
I0319 11:48:13.420053 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:48:13.420065 543705 net.go:698] Add success.
I0319 11:48:13.420300 543705 net.go:648] Add success.
I0319 11:48:13.467805 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"44bff1db-f316-44ac-aa30-3a9ff66eca2a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:48:13.467836 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:48:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:48:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:48:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 11:48:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:48:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 11:48:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:48:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:48:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:48:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:48:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:48:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:48:18.969670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:48:18.972040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:48:18.972048 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abf40 0xc0000c4100]
E0319 11:48:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:23.409794 543705 memory.go:184] no items to output this cycle
I0319 11:48:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 11:48:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:33.409782 543705 memory.go:184] no items to output this cycle
I0319 11:48:33.409802 543705 cpu.go:275] no items to output this cycle
I0319 11:48:37.699195 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:48:37.699207 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:48:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:43.410600 543705 memory.go:191] Add success.
I0319 11:48:43.409788 543705 cpu.go:282] Add success.
I0319 11:48:43.420344 543705 net.go:648] Add success.
I0319 11:48:43.422828 543705 net.go:770] primary dev: ETH0
I0319 11:48:43.422842 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:48:43.422856 543705 net.go:698] Add success.
I0319 11:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:48:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:48:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:48:53.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:48:53.409890 543705 memory.go:184] no items to output this cycle
I0319 11:48:53.409930 543705 cpu.go:275] no items to output this cycle
E0319 11:49:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:03.409774 543705 memory.go:184] no items to output this cycle
I0319 11:49:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 11:49:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:13.409780 543705 memory.go:191] Add success.
W0319 11:49:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 11:49:13.409806 543705 cpu.go:282] Add success.
W0319 11:49:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:49:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:49:13.420245 543705 net.go:648] Add success.
I0319 11:49:13.423301 543705 net.go:770] primary dev: ETH0
I0319 11:49:13.423315 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:49:13.423330 543705 net.go:698] Add success.
I0319 11:49:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:49:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:49:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 11:49:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:49:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 11:49:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:49:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:49:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:49:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:49:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:49:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:49:18.973671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:49:18.976067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:49:18.976073 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee340 0xc0003ee380]
E0319 11:49:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:23.409771 543705 cpu.go:275] no items to output this cycle
I0319 11:49:23.409781 543705 memory.go:184] no items to output this cycle
E0319 11:49:33.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:33.409824 543705 memory.go:184] no items to output this cycle
I0319 11:49:33.409852 543705 cpu.go:275] no items to output this cycle
E0319 11:49:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:43.409818 543705 memory.go:191] Add success.
I0319 11:49:43.409824 543705 cpu.go:282] Add success.
I0319 11:49:43.420019 543705 net.go:648] Add success.
I0319 11:49:43.422841 543705 net.go:770] primary dev: ETH0
I0319 11:49:43.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:49:43.422871 543705 net.go:698] Add success.
I0319 11:49:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:49:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:49:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:49:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:49:53.409763 543705 memory.go:184] no items to output this cycle
I0319 11:49:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:50:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:03.409771 543705 memory.go:184] no items to output this cycle
I0319 11:50:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:50:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:13.409812 543705 memory.go:191] Add success.
I0319 11:50:13.409822 543705 cpu.go:282] Add success.
W0319 11:50:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:50:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:50:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:50:13.420154 543705 net.go:648] Add success.
I0319 11:50:13.422995 543705 net.go:770] primary dev: ETH0
I0319 11:50:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:50:13.423023 543705 net.go:698] Add success.
I0319 11:50:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:50:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:50:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 11:50:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:50:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 11:50:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:50:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:50:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:50:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:50:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:50:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:50:18.977671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:50:18.980065 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:50:18.980070 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509540 0xc000509580]
E0319 11:50:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:23.409765 543705 memory.go:184] no items to output this cycle
I0319 11:50:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 11:50:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:33.409808 543705 memory.go:184] no items to output this cycle
I0319 11:50:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 11:50:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:43.409787 543705 memory.go:191] Add success.
I0319 11:50:43.409787 543705 cpu.go:282] Add success.
I0319 11:50:43.420193 543705 net.go:648] Add success.
I0319 11:50:43.422872 543705 net.go:770] primary dev: ETH0
I0319 11:50:43.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:50:43.422896 543705 net.go:698] Add success.
I0319 11:50:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:50:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:50:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:50:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:50:53.409769 543705 memory.go:184] no items to output this cycle
I0319 11:50:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 11:51:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:03.409807 543705 memory.go:184] no items to output this cycle
I0319 11:51:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 11:51:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:13.409791 543705 memory.go:191] Add success.
I0319 11:51:13.409810 543705 cpu.go:282] Add success.
W0319 11:51:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:51:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:51:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:51:13.420163 543705 net.go:648] Add success.
I0319 11:51:13.422816 543705 net.go:770] primary dev: ETH0
I0319 11:51:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:51:13.422842 543705 net.go:698] Add success.
I0319 11:51:13.463774 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d938bbb3-f84b-4c32-af99-88dca57cd790","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:51:13.463809 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:51:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:51:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:51:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0319 11:51:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:51:14.456517 543705 disk_worker.go:494] system disk:vda1
I0319 11:51:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:51:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:51:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:51:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:51:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:51:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:51:18.981672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:51:18.984066 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:51:18.984072 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004861c0 0xc000486200]
E0319 11:51:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:23.409767 543705 memory.go:184] no items to output this cycle
I0319 11:51:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:51:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:33.409799 543705 memory.go:184] no items to output this cycle
I0319 11:51:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 11:51:37.699350 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:51:37.699357 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:51:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:43.410679 543705 memory.go:191] Add success.
I0319 11:51:43.409827 543705 cpu.go:282] Add success.
I0319 11:51:43.419757 543705 net.go:648] Add success.
I0319 11:51:43.422425 543705 net.go:770] primary dev: ETH0
I0319 11:51:43.422439 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:51:43.422454 543705 net.go:698] Add success.
I0319 11:51:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:51:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:51:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:51:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:51:53.409791 543705 memory.go:184] no items to output this cycle
I0319 11:51:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:52:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:03.409788 543705 memory.go:184] no items to output this cycle
I0319 11:52:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:52:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:13.409794 543705 cpu.go:282] Add success.
I0319 11:52:13.409796 543705 memory.go:191] Add success.
W0319 11:52:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:52:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:52:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:52:13.420066 543705 net.go:648] Add success.
I0319 11:52:13.422636 543705 net.go:770] primary dev: ETH0
I0319 11:52:13.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:52:13.422674 543705 net.go:698] Add success.
W0319 11:52:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:52:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 11:52:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:52:14.456934 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:52:14.456944 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:52:14.456950 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:52:14.457001 543705 disk_worker.go:494] system disk:vda1
I0319 11:52:14.457031 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:52:15.456428 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:52:15.456436 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:52:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:52:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:52:16.458003 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:52:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:52:16.472356 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:52:18.985679 543705 disk_info.go:125] begin check local disk info of client
I0319 11:52:18.987953 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:52:18.987959 543705 disk_info.go:196] parse disk info done, disk is : [0xc000383e00 0xc000383e40]
E0319 11:52:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:23.409778 543705 memory.go:184] no items to output this cycle
I0319 11:52:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 11:52:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:33.409788 543705 memory.go:184] no items to output this cycle
I0319 11:52:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 11:52:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:43.409903 543705 memory.go:191] Add success.
I0319 11:52:43.409930 543705 cpu.go:282] Add success.
I0319 11:52:43.419717 543705 net.go:648] Add success.
I0319 11:52:43.422386 543705 net.go:770] primary dev: ETH0
I0319 11:52:43.422399 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:52:43.422411 543705 net.go:698] Add success.
I0319 11:52:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:52:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:52:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:52:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:52:53.409787 543705 memory.go:184] no items to output this cycle
I0319 11:52:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:53:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:03.409809 543705 memory.go:184] no items to output this cycle
I0319 11:53:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:53:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:13.409799 543705 memory.go:191] Add success.
I0319 11:53:13.409823 543705 cpu.go:282] Add success.
W0319 11:53:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:53:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:53:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:53:13.420138 543705 net.go:648] Add success.
I0319 11:53:13.422981 543705 net.go:770] primary dev: ETH0
I0319 11:53:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:53:13.423010 543705 net.go:698] Add success.
I0319 11:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:53:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:53:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 11:53:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:53:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 11:53:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:53:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:53:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:53:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:53:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:53:16.472352 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:53:18.989673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:53:18.992102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:53:18.992109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5d80 0xc0000c5dc0]
E0319 11:53:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:23.409781 543705 memory.go:184] no items to output this cycle
I0319 11:53:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 11:53:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:33.409785 543705 memory.go:184] no items to output this cycle
I0319 11:53:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 11:53:43.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:43.409912 543705 memory.go:191] Add success.
I0319 11:53:43.409936 543705 cpu.go:282] Add success.
I0319 11:53:43.419732 543705 net.go:648] Add success.
I0319 11:53:43.422722 543705 net.go:770] primary dev: ETH0
I0319 11:53:43.422735 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:53:43.422748 543705 net.go:698] Add success.
I0319 11:53:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:53:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:53:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:53:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:53:53.409777 543705 memory.go:184] no items to output this cycle
I0319 11:53:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 11:54:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:03.409821 543705 memory.go:184] no items to output this cycle
I0319 11:54:03.409835 543705 cpu.go:275] no items to output this cycle
E0319 11:54:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:13.409793 543705 memory.go:191] Add success.
I0319 11:54:13.409794 543705 cpu.go:282] Add success.
W0319 11:54:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:54:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:54:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:54:13.420143 543705 net.go:648] Add success.
I0319 11:54:13.422896 543705 net.go:770] primary dev: ETH0
I0319 11:54:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:54:13.422924 543705 net.go:698] Add success.
I0319 11:54:13.464871 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d63e06df-3b77-406d-b9aa-1d7ca7dac31a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:54:13.464907 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 11:54:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:54:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:54:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 11:54:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:54:14.456722 543705 disk_worker.go:494] system disk:vda1
I0319 11:54:14.456754 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:54:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:54:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:54:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:54:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:54:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:54:18.993670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:54:18.996032 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:54:18.996038 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035bc00 0xc00035bc40]
E0319 11:54:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:23.409774 543705 memory.go:184] no items to output this cycle
I0319 11:54:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 11:54:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:33.409784 543705 memory.go:184] no items to output this cycle
I0319 11:54:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 11:54:37.699500 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:54:37.699506 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:54:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:43.410714 543705 memory.go:191] Add success.
I0319 11:54:43.409813 543705 cpu.go:282] Add success.
I0319 11:54:43.420414 543705 net.go:648] Add success.
I0319 11:54:43.423277 543705 net.go:770] primary dev: ETH0
I0319 11:54:43.423292 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:54:43.423307 543705 net.go:698] Add success.
I0319 11:54:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:54:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:54:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:54:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:54:53.409802 543705 memory.go:184] no items to output this cycle
I0319 11:54:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 11:55:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:03.409794 543705 memory.go:184] no items to output this cycle
I0319 11:55:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 11:55:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:13.409803 543705 memory.go:191] Add success.
I0319 11:55:13.409804 543705 cpu.go:282] Add success.
W0319 11:55:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:55:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:55:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:55:13.420112 543705 net.go:648] Add success.
I0319 11:55:13.422960 543705 net.go:770] primary dev: ETH0
I0319 11:55:13.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:55:13.422986 543705 net.go:698] Add success.
I0319 11:55:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:55:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:55:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 11:55:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:55:14.456485 543705 disk_worker.go:494] system disk:vda1
I0319 11:55:14.456531 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:55:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:55:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:55:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:55:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:55:16.472423 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:55:18.997673 543705 disk_info.go:125] begin check local disk info of client
I0319 11:55:19.000046 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:55:19.000052 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035bb40 0xc00035bb80]
E0319 11:55:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:23.409798 543705 memory.go:184] no items to output this cycle
I0319 11:55:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 11:55:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:33.409779 543705 memory.go:184] no items to output this cycle
I0319 11:55:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 11:55:43.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:43.409928 543705 memory.go:191] Add success.
I0319 11:55:43.409966 543705 cpu.go:282] Add success.
I0319 11:55:43.419724 543705 net.go:648] Add success.
I0319 11:55:43.422489 543705 net.go:770] primary dev: ETH0
I0319 11:55:43.422503 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:55:43.422514 543705 net.go:698] Add success.
I0319 11:55:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:55:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:55:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:55:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:55:53.409787 543705 memory.go:184] no items to output this cycle
I0319 11:55:53.409819 543705 cpu.go:275] no items to output this cycle
E0319 11:56:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:03.409790 543705 memory.go:184] no items to output this cycle
I0319 11:56:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 11:56:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:13.409800 543705 memory.go:191] Add success.
I0319 11:56:13.409802 543705 cpu.go:282] Add success.
W0319 11:56:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:56:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:56:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:56:13.420096 543705 net.go:648] Add success.
I0319 11:56:13.423014 543705 net.go:770] primary dev: ETH0
I0319 11:56:13.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:56:13.423041 543705 net.go:698] Add success.
I0319 11:56:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:56:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:56:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 11:56:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:56:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 11:56:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:56:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:56:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:56:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:56:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:56:16.472354 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:56:19.001679 543705 disk_info.go:125] begin check local disk info of client
I0319 11:56:19.004021 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:56:19.004028 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc180 0xc0002bc240]
E0319 11:56:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:23.409765 543705 memory.go:184] no items to output this cycle
I0319 11:56:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 11:56:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:33.409805 543705 memory.go:184] no items to output this cycle
I0319 11:56:33.409807 543705 cpu.go:275] no items to output this cycle
E0319 11:56:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:43.409807 543705 cpu.go:282] Add success.
I0319 11:56:43.409799 543705 memory.go:191] Add success.
I0319 11:56:43.419981 543705 net.go:648] Add success.
I0319 11:56:43.422995 543705 net.go:770] primary dev: ETH0
I0319 11:56:43.423025 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:56:43.423039 543705 net.go:698] Add success.
I0319 11:56:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:56:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:56:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:56:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:56:53.409793 543705 memory.go:184] no items to output this cycle
I0319 11:56:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 11:57:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:03.409779 543705 memory.go:184] no items to output this cycle
I0319 11:57:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:57:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:13.409807 543705 memory.go:191] Add success.
I0319 11:57:13.409819 543705 cpu.go:282] Add success.
W0319 11:57:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:57:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:57:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:57:13.420234 543705 net.go:648] Add success.
I0319 11:57:13.428922 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 11:57:13.428997 543705 net.go:770] primary dev: ETH0
I0319 11:57:13.429009 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:57:13.429020 543705 net.go:698] Add success.
I0319 11:57:13.453549 543705 event_worker.go:152] Polling the log file for events...
I0319 11:57:13.464643 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d16c0786-ab2a-4ce3-b5eb-828354d76315","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 11:57:13.464677 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 11:57:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:57:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 11:57:14.455165 543705 disk_worker.go:728] disk inode is not compliant
E0319 11:57:14.456969 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 11:57:14.456978 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 11:57:14.456984 543705 custom_config.go:64] query custom config with name: gpu
I0319 11:57:14.456996 543705 disk_worker.go:494] system disk:vda1
I0319 11:57:14.457029 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 11:57:15.456778 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 11:57:15.456786 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:57:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 11:57:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 11:57:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:57:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:57:16.472327 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:57:19.005672 543705 disk_info.go:125] begin check local disk info of client
I0319 11:57:19.008001 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:57:19.008009 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329f40 0xc000474000]
E0319 11:57:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:23.409797 543705 memory.go:184] no items to output this cycle
I0319 11:57:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 11:57:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:33.409800 543705 memory.go:184] no items to output this cycle
I0319 11:57:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 11:57:37.700201 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 11:57:37.700208 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 11:57:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:43.410760 543705 memory.go:191] Add success.
I0319 11:57:43.409822 543705 cpu.go:282] Add success.
I0319 11:57:43.420479 543705 net.go:648] Add success.
I0319 11:57:43.423727 543705 net.go:770] primary dev: ETH0
I0319 11:57:43.423741 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:57:43.423755 543705 net.go:698] Add success.
I0319 11:57:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:57:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:57:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:57:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:57:53.409784 543705 memory.go:184] no items to output this cycle
I0319 11:57:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 11:58:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:03.409789 543705 memory.go:184] no items to output this cycle
I0319 11:58:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:58:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:13.409797 543705 memory.go:191] Add success.
I0319 11:58:13.409798 543705 cpu.go:282] Add success.
W0319 11:58:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:58:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:58:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:58:13.420108 543705 net.go:648] Add success.
I0319 11:58:13.422750 543705 net.go:770] primary dev: ETH0
I0319 11:58:13.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:58:13.422774 543705 net.go:698] Add success.
I0319 11:58:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:58:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:58:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 11:58:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:58:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 11:58:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:58:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:58:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:58:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:58:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:58:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:58:19.009671 543705 disk_info.go:125] begin check local disk info of client
I0319 11:58:19.012098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:58:19.012105 543705 disk_info.go:196] parse disk info done, disk is : [0xc000257300 0xc000257340]
E0319 11:58:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:23.409758 543705 memory.go:184] no items to output this cycle
I0319 11:58:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 11:58:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:33.409789 543705 memory.go:184] no items to output this cycle
I0319 11:58:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 11:58:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:43.409809 543705 memory.go:191] Add success.
I0319 11:58:43.409818 543705 cpu.go:282] Add success.
I0319 11:58:43.420009 543705 net.go:648] Add success.
I0319 11:58:43.422885 543705 net.go:770] primary dev: ETH0
I0319 11:58:43.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:58:43.422911 543705 net.go:698] Add success.
I0319 11:58:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:58:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:58:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:58:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:58:53.409775 543705 memory.go:184] no items to output this cycle
I0319 11:58:53.409776 543705 cpu.go:275] no items to output this cycle
E0319 11:59:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:03.409781 543705 memory.go:184] no items to output this cycle
I0319 11:59:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 11:59:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:13.409780 543705 memory.go:191] Add success.
I0319 11:59:13.409801 543705 cpu.go:282] Add success.
W0319 11:59:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 11:59:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 11:59:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 11:59:13.420166 543705 net.go:648] Add success.
I0319 11:59:13.423103 543705 net.go:770] primary dev: ETH0
I0319 11:59:13.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:59:13.423128 543705 net.go:698] Add success.
I0319 11:59:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 11:59:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 11:59:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 11:59:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 11:59:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 11:59:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 11:59:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 11:59:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:59:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:59:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 11:59:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 11:59:19.013670 543705 disk_info.go:125] begin check local disk info of client
I0319 11:59:19.016031 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 11:59:19.016037 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034b700 0xc00034b740]
E0319 11:59:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:23.409804 543705 memory.go:184] no items to output this cycle
I0319 11:59:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 11:59:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:33.409795 543705 memory.go:184] no items to output this cycle
I0319 11:59:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 11:59:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:43.409795 543705 memory.go:191] Add success.
I0319 11:59:43.409818 543705 cpu.go:282] Add success.
I0319 11:59:43.419943 543705 net.go:648] Add success.
I0319 11:59:43.423259 543705 net.go:770] primary dev: ETH0
I0319 11:59:43.423272 543705 net.go:802] Send network stats successfully!,count is 6
I0319 11:59:43.423285 543705 net.go:698] Add success.
I0319 11:59:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 11:59:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 11:59:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 11:59:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 11:59:53.409777 543705 memory.go:184] no items to output this cycle
I0319 11:59:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 12:00:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:03.409785 543705 memory.go:184] no items to output this cycle
I0319 12:00:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 12:00:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:13.409814 543705 memory.go:191] Add success.
I0319 12:00:13.409817 543705 cpu.go:282] Add success.
W0319 12:00:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:00:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:00:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:00:13.420050 543705 net.go:648] Add success.
I0319 12:00:13.423398 543705 net.go:770] primary dev: ETH0
I0319 12:00:13.423414 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:00:13.423428 543705 net.go:698] Add success.
I0319 12:00:13.609525 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9287e534-31c3-4efa-a832-afb93ea67566","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:00:13.609559 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:00:14.453966 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:00:14.455292 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:00:14.455302 543705 disk_worker.go:708] disk space is not compliant
W0319 12:00:14.455305 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:00:14.456793 543705 disk_worker.go:494] system disk:vda1
I0319 12:00:14.456821 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:00:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:00:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:00:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:00:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:00:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:00:19.017670 543705 disk_info.go:125] begin check local disk info of client
I0319 12:00:19.020091 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:00:19.020097 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314bc0 0xc000314c00]
E0319 12:00:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:23.409776 543705 memory.go:184] no items to output this cycle
I0319 12:00:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 12:00:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:33.409799 543705 memory.go:184] no items to output this cycle
I0319 12:00:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 12:00:37.700360 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:00:37.700367 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:00:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:43.410889 543705 memory.go:191] Add success.
I0319 12:00:43.409820 543705 cpu.go:282] Add success.
I0319 12:00:43.420569 543705 net.go:648] Add success.
I0319 12:00:43.423493 543705 net.go:770] primary dev: ETH0
I0319 12:00:43.423507 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:00:43.423520 543705 net.go:698] Add success.
I0319 12:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:00:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:00:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:00:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:00:53.409796 543705 memory.go:184] no items to output this cycle
I0319 12:00:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:01:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:03.409776 543705 memory.go:184] no items to output this cycle
I0319 12:01:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:01:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:13.409811 543705 memory.go:191] Add success.
I0319 12:01:13.409822 543705 cpu.go:282] Add success.
W0319 12:01:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:01:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:01:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:01:13.420219 543705 net.go:648] Add success.
I0319 12:01:13.423170 543705 net.go:770] primary dev: ETH0
I0319 12:01:13.423184 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:01:13.423197 543705 net.go:698] Add success.
I0319 12:01:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:01:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:01:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 12:01:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:01:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 12:01:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:01:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:01:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:01:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:01:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:01:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:01:19.021671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:01:19.024098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:01:19.024104 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001efc40 0xc0001efc80]
E0319 12:01:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:23.409771 543705 memory.go:184] no items to output this cycle
I0319 12:01:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:01:33.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:33.409888 543705 memory.go:184] no items to output this cycle
I0319 12:01:33.409921 543705 cpu.go:275] no items to output this cycle
E0319 12:01:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:43.409793 543705 memory.go:191] Add success.
I0319 12:01:43.409798 543705 cpu.go:282] Add success.
I0319 12:01:43.419999 543705 net.go:648] Add success.
I0319 12:01:43.422809 543705 net.go:770] primary dev: ETH0
I0319 12:01:43.422823 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:01:43.422837 543705 net.go:698] Add success.
I0319 12:01:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:01:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:01:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:01:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:01:53.409775 543705 cpu.go:275] no items to output this cycle
I0319 12:01:53.409779 543705 memory.go:184] no items to output this cycle
E0319 12:02:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:03.409783 543705 memory.go:184] no items to output this cycle
I0319 12:02:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:02:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:13.409780 543705 memory.go:191] Add success.
I0319 12:02:13.409799 543705 cpu.go:282] Add success.
W0319 12:02:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:02:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:02:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:02:13.420335 543705 net.go:648] Add success.
I0319 12:02:13.423123 543705 net.go:770] primary dev: ETH0
I0319 12:02:13.423138 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:02:13.423149 543705 net.go:698] Add success.
W0319 12:02:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:02:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 12:02:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:02:14.455865 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:02:14.455874 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:02:14.455880 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:02:14.456607 543705 disk_worker.go:494] system disk:vda1
I0319 12:02:14.456648 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:02:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:02:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:02:16.457903 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:02:16.457902 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:02:16.457959 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:02:16.457978 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:02:16.472301 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:02:19.025669 543705 disk_info.go:125] begin check local disk info of client
I0319 12:02:19.027987 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:02:19.027992 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509880 0xc0005098c0]
E0319 12:02:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:23.409787 543705 memory.go:184] no items to output this cycle
I0319 12:02:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:02:33.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:33.409905 543705 memory.go:184] no items to output this cycle
I0319 12:02:33.409936 543705 cpu.go:275] no items to output this cycle
E0319 12:02:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:43.409812 543705 memory.go:191] Add success.
I0319 12:02:43.409829 543705 cpu.go:282] Add success.
I0319 12:02:43.419949 543705 net.go:648] Add success.
I0319 12:02:43.422797 543705 net.go:770] primary dev: ETH0
I0319 12:02:43.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:02:43.422827 543705 net.go:698] Add success.
I0319 12:02:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:02:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:02:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:02:53.409763 543705 memory.go:184] no items to output this cycle
I0319 12:02:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 12:03:03.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:03.409822 543705 memory.go:184] no items to output this cycle
I0319 12:03:03.409834 543705 cpu.go:275] no items to output this cycle
E0319 12:03:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:13.409786 543705 memory.go:191] Add success.
W0319 12:03:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:03:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:03:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:03:13.409828 543705 cpu.go:282] Add success.
I0319 12:03:13.420128 543705 net.go:648] Add success.
I0319 12:03:13.423152 543705 net.go:770] primary dev: ETH0
I0319 12:03:13.423165 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:03:13.423177 543705 net.go:698] Add success.
I0319 12:03:13.508275 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fc90e21a-1d47-4a44-9072-bfc01757b7da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:03:13.508319 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:03:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:03:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:03:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0319 12:03:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:03:14.456737 543705 disk_worker.go:494] system disk:vda1
I0319 12:03:14.456769 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:03:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:03:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:03:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:03:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:03:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:03:19.029680 543705 disk_info.go:125] begin check local disk info of client
I0319 12:03:19.032076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:03:19.032082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a52c0 0xc0002a5300]
E0319 12:03:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:23.409802 543705 memory.go:184] no items to output this cycle
I0319 12:03:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 12:03:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:33.409779 543705 memory.go:184] no items to output this cycle
I0319 12:03:33.409782 543705 cpu.go:275] no items to output this cycle
I0319 12:03:37.700507 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:03:37.700515 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:03:43.409851 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:43.411023 543705 memory.go:191] Add success.
I0319 12:03:43.410072 543705 cpu.go:282] Add success.
I0319 12:03:43.419716 543705 net.go:648] Add success.
I0319 12:03:43.422908 543705 net.go:770] primary dev: ETH0
I0319 12:03:43.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:03:43.422934 543705 net.go:698] Add success.
I0319 12:03:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:03:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:03:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:03:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:03:53.409781 543705 memory.go:184] no items to output this cycle
I0319 12:03:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:04:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:03.409812 543705 memory.go:184] no items to output this cycle
I0319 12:04:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 12:04:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:13.409779 543705 memory.go:191] Add success.
W0319 12:04:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:04:13.409809 543705 cpu.go:282] Add success.
W0319 12:04:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:04:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:04:13.420120 543705 net.go:648] Add success.
I0319 12:04:13.422804 543705 net.go:770] primary dev: ETH0
I0319 12:04:13.422820 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:04:13.422834 543705 net.go:698] Add success.
I0319 12:04:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:04:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:04:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 12:04:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:04:14.456621 543705 disk_worker.go:494] system disk:vda1
I0319 12:04:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:04:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:04:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:04:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:04:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:04:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:04:19.033675 543705 disk_info.go:125] begin check local disk info of client
I0319 12:04:19.036122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:04:19.036130 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b740 0xc00007b780]
E0319 12:04:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:23.409801 543705 memory.go:184] no items to output this cycle
I0319 12:04:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 12:04:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:33.409783 543705 memory.go:184] no items to output this cycle
I0319 12:04:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:04:43.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:43.409917 543705 memory.go:191] Add success.
I0319 12:04:43.409947 543705 cpu.go:282] Add success.
I0319 12:04:43.419709 543705 net.go:648] Add success.
I0319 12:04:43.422585 543705 net.go:770] primary dev: ETH0
I0319 12:04:43.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:04:43.422609 543705 net.go:698] Add success.
I0319 12:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:04:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:04:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:04:53.409795 543705 memory.go:184] no items to output this cycle
I0319 12:04:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:05:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:03.409807 543705 memory.go:184] no items to output this cycle
I0319 12:05:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 12:05:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:13.409782 543705 memory.go:191] Add success.
W0319 12:05:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:05:13.409813 543705 cpu.go:282] Add success.
W0319 12:05:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:05:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:05:13.420125 543705 net.go:648] Add success.
I0319 12:05:13.422760 543705 net.go:770] primary dev: ETH0
I0319 12:05:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:05:13.422784 543705 net.go:698] Add success.
I0319 12:05:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:05:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:05:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 12:05:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:05:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 12:05:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:05:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:05:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:05:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:05:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:05:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:05:19.037677 543705 disk_info.go:125] begin check local disk info of client
I0319 12:05:19.040097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:05:19.040104 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047a100 0xc00047a140]
E0319 12:05:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:23.409793 543705 memory.go:184] no items to output this cycle
I0319 12:05:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 12:05:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:33.409800 543705 memory.go:184] no items to output this cycle
I0319 12:05:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:05:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:43.409778 543705 memory.go:191] Add success.
I0319 12:05:43.409809 543705 cpu.go:282] Add success.
I0319 12:05:43.420183 543705 net.go:648] Add success.
I0319 12:05:43.423118 543705 net.go:770] primary dev: ETH0
I0319 12:05:43.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:05:43.423143 543705 net.go:698] Add success.
I0319 12:05:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:05:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:05:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:05:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:05:53.409799 543705 memory.go:184] no items to output this cycle
I0319 12:05:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 12:06:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:03.409788 543705 memory.go:184] no items to output this cycle
I0319 12:06:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 12:06:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:13.409813 543705 memory.go:191] Add success.
I0319 12:06:13.409825 543705 cpu.go:282] Add success.
W0319 12:06:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:06:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:06:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:06:13.420392 543705 net.go:648] Add success.
I0319 12:06:13.422933 543705 net.go:770] primary dev: ETH0
I0319 12:06:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:06:13.422958 543705 net.go:698] Add success.
I0319 12:06:13.573918 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3e351b98-be9f-4214-8a60-e57e9887c08f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:06:13.573956 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:06:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:06:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:06:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 12:06:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:06:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 12:06:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:06:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:06:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:06:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:06:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:06:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:06:19.041673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:06:19.044128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:06:19.044135 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005096c0 0xc000509700]
E0319 12:06:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:23.409760 543705 memory.go:184] no items to output this cycle
I0319 12:06:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:06:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:33.409810 543705 memory.go:184] no items to output this cycle
I0319 12:06:33.409824 543705 cpu.go:275] no items to output this cycle
I0319 12:06:37.701198 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:06:37.701204 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:06:43.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:43.411021 543705 memory.go:191] Add success.
I0319 12:06:43.409983 543705 cpu.go:282] Add success.
I0319 12:06:43.419730 543705 net.go:648] Add success.
I0319 12:06:43.422879 543705 net.go:770] primary dev: ETH0
I0319 12:06:43.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:06:43.422903 543705 net.go:698] Add success.
I0319 12:06:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:06:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:06:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:06:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:06:53.409772 543705 memory.go:184] no items to output this cycle
I0319 12:06:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 12:07:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:03.409778 543705 memory.go:184] no items to output this cycle
I0319 12:07:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:07:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:13.409778 543705 memory.go:191] Add success.
W0319 12:07:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:07:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:07:13.409819 543705 cpu.go:282] Add success.
I0319 12:07:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:07:13.420250 543705 net.go:648] Add success.
I0319 12:07:13.423050 543705 net.go:770] primary dev: ETH0
I0319 12:07:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:07:13.423074 543705 net.go:698] Add success.
I0319 12:07:13.453630 543705 event_worker.go:152] Polling the log file for events...
W0319 12:07:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:07:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 12:07:14.455190 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:07:14.455910 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:07:14.455919 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:07:14.455925 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:07:14.456538 543705 disk_worker.go:494] system disk:vda1
I0319 12:07:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:07:15.456797 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:07:15.456806 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:07:16.458054 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:07:16.458065 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:07:16.458106 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:07:16.458125 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:07:16.472485 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:07:19.045673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:07:19.048076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:07:19.048083 543705 disk_info.go:196] parse disk info done, disk is : [0xc000595940 0xc000595980]
E0319 12:07:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:23.409795 543705 memory.go:184] no items to output this cycle
I0319 12:07:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:07:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:33.409779 543705 memory.go:184] no items to output this cycle
I0319 12:07:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 12:07:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:43.409792 543705 memory.go:191] Add success.
I0319 12:07:43.409823 543705 cpu.go:282] Add success.
I0319 12:07:43.419758 543705 net.go:648] Add success.
I0319 12:07:43.422525 543705 net.go:770] primary dev: ETH0
I0319 12:07:43.422538 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:07:43.422550 543705 net.go:698] Add success.
I0319 12:07:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:07:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:07:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:07:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:07:53.409774 543705 memory.go:184] no items to output this cycle
I0319 12:07:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 12:08:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:03.409783 543705 memory.go:184] no items to output this cycle
I0319 12:08:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:08:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:13.409807 543705 memory.go:191] Add success.
I0319 12:08:13.409808 543705 cpu.go:282] Add success.
W0319 12:08:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:08:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:08:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:08:13.419791 543705 net.go:648] Add success.
I0319 12:08:13.422674 543705 net.go:770] primary dev: ETH0
I0319 12:08:13.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:08:13.422709 543705 net.go:698] Add success.
I0319 12:08:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:08:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:08:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 12:08:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:08:14.456796 543705 disk_worker.go:494] system disk:vda1
I0319 12:08:14.456826 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:08:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:08:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:08:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:08:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:08:16.472475 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:08:19.049672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:08:19.052136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:08:19.052143 543705 disk_info.go:196] parse disk info done, disk is : [0xc000469200 0xc000469240]
E0319 12:08:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:23.409805 543705 memory.go:184] no items to output this cycle
I0319 12:08:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 12:08:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:33.409782 543705 memory.go:184] no items to output this cycle
I0319 12:08:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 12:08:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:43.409798 543705 memory.go:191] Add success.
I0319 12:08:43.409803 543705 cpu.go:282] Add success.
I0319 12:08:43.419892 543705 net.go:648] Add success.
I0319 12:08:43.422618 543705 net.go:770] primary dev: ETH0
I0319 12:08:43.422631 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:08:43.422643 543705 net.go:698] Add success.
I0319 12:08:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:08:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:08:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:08:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:08:53.409810 543705 memory.go:184] no items to output this cycle
I0319 12:08:53.409822 543705 cpu.go:275] no items to output this cycle
E0319 12:09:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:03.409807 543705 memory.go:184] no items to output this cycle
I0319 12:09:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:09:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:13.409824 543705 memory.go:191] Add success.
I0319 12:09:13.409831 543705 cpu.go:282] Add success.
W0319 12:09:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:09:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:09:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:09:13.420294 543705 net.go:648] Add success.
I0319 12:09:13.422981 543705 net.go:770] primary dev: ETH0
I0319 12:09:13.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:09:13.423006 543705 net.go:698] Add success.
I0319 12:09:13.617519 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b73645ce-8beb-4713-b469-fdade4da1721","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:09:13.617553 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:09:14.453984 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:09:14.454202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:09:14.454213 543705 disk_worker.go:708] disk space is not compliant
W0319 12:09:14.454216 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:09:14.455578 543705 disk_worker.go:494] system disk:vda1
I0319 12:09:14.455629 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:09:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:09:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:09:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:09:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:09:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:09:19.053672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:09:19.056080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:09:19.056086 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049c280 0xc00049c2c0]
E0319 12:09:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:23.409789 543705 cpu.go:275] no items to output this cycle
I0319 12:09:23.409790 543705 memory.go:184] no items to output this cycle
E0319 12:09:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:33.409806 543705 memory.go:184] no items to output this cycle
I0319 12:09:33.409822 543705 cpu.go:275] no items to output this cycle
I0319 12:09:37.701734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:09:37.701742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:09:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:43.410759 543705 memory.go:191] Add success.
I0319 12:09:43.409810 543705 cpu.go:282] Add success.
I0319 12:09:43.420549 543705 net.go:648] Add success.
I0319 12:09:43.423966 543705 net.go:770] primary dev: ETH0
I0319 12:09:43.423980 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:09:43.423993 543705 net.go:698] Add success.
I0319 12:09:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:09:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:09:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:09:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:09:53.409791 543705 memory.go:184] no items to output this cycle
I0319 12:09:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:10:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:03.409774 543705 memory.go:184] no items to output this cycle
I0319 12:10:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:10:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:13.409782 543705 memory.go:191] Add success.
W0319 12:10:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:10:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:10:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:10:13.409834 543705 cpu.go:282] Add success.
I0319 12:10:13.420065 543705 net.go:648] Add success.
I0319 12:10:13.422701 543705 net.go:770] primary dev: ETH0
I0319 12:10:13.422717 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:10:13.422733 543705 net.go:698] Add success.
I0319 12:10:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:10:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:10:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0319 12:10:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:10:14.456475 543705 disk_worker.go:494] system disk:vda1
I0319 12:10:14.456519 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:10:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:10:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:10:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:10:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:10:16.472361 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:10:19.057678 543705 disk_info.go:125] begin check local disk info of client
I0319 12:10:19.060056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:10:19.060061 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d8a00 0xc0003d8a40]
E0319 12:10:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:23.409801 543705 memory.go:184] no items to output this cycle
I0319 12:10:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:10:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:33.409800 543705 memory.go:184] no items to output this cycle
I0319 12:10:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 12:10:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:43.409802 543705 memory.go:191] Add success.
I0319 12:10:43.409805 543705 cpu.go:282] Add success.
I0319 12:10:43.419899 543705 net.go:648] Add success.
I0319 12:10:43.422826 543705 net.go:770] primary dev: ETH0
I0319 12:10:43.422842 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:10:43.422855 543705 net.go:698] Add success.
I0319 12:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:10:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:10:53.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:10:53.409903 543705 memory.go:184] no items to output this cycle
I0319 12:10:53.409919 543705 cpu.go:275] no items to output this cycle
E0319 12:11:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:03.409800 543705 memory.go:184] no items to output this cycle
I0319 12:11:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:11:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:13.409832 543705 memory.go:191] Add success.
I0319 12:11:13.409835 543705 cpu.go:282] Add success.
W0319 12:11:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:11:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:11:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:11:13.420164 543705 net.go:648] Add success.
I0319 12:11:13.422817 543705 net.go:770] primary dev: ETH0
I0319 12:11:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:11:13.422845 543705 net.go:698] Add success.
I0319 12:11:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:11:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:11:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 12:11:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:11:14.456497 543705 disk_worker.go:494] system disk:vda1
I0319 12:11:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:11:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:11:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:11:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:11:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:11:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:11:19.061674 543705 disk_info.go:125] begin check local disk info of client
I0319 12:11:19.064051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:11:19.064057 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c59c0 0xc0000c5a00]
E0319 12:11:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:23.409802 543705 memory.go:184] no items to output this cycle
I0319 12:11:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 12:11:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:33.409789 543705 memory.go:184] no items to output this cycle
I0319 12:11:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 12:11:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:43.409806 543705 memory.go:191] Add success.
I0319 12:11:43.409807 543705 cpu.go:282] Add success.
I0319 12:11:43.419873 543705 net.go:648] Add success.
I0319 12:11:43.422666 543705 net.go:770] primary dev: ETH0
I0319 12:11:43.422681 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:11:43.422696 543705 net.go:698] Add success.
I0319 12:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:11:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:11:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:11:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:11:53.409804 543705 memory.go:184] no items to output this cycle
I0319 12:11:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 12:12:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:03.409806 543705 memory.go:184] no items to output this cycle
I0319 12:12:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 12:12:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:13.409824 543705 memory.go:191] Add success.
I0319 12:12:13.409842 543705 cpu.go:282] Add success.
W0319 12:12:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:12:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:12:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:12:13.420220 543705 net.go:648] Add success.
I0319 12:12:13.423130 543705 net.go:770] primary dev: ETH0
I0319 12:12:13.423143 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:12:13.423156 543705 net.go:698] Add success.
I0319 12:12:13.468961 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b9821d09-e7a7-430f-bb0f-13343796a91d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:12:13.468995 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 12:12:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:12:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 12:12:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:12:14.456011 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:12:14.456020 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:12:14.456025 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:12:14.456453 543705 disk_worker.go:494] system disk:vda1
I0319 12:12:14.456483 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:12:15.456817 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:12:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:12:16.457908 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:12:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:12:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:12:16.457982 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:12:16.472308 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:12:19.065673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:12:19.068050 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:12:19.068056 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b980 0xc00007b9c0]
E0319 12:12:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:23.409782 543705 memory.go:184] no items to output this cycle
I0319 12:12:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 12:12:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:33.409818 543705 memory.go:184] no items to output this cycle
I0319 12:12:33.409837 543705 cpu.go:275] no items to output this cycle
I0319 12:12:37.701884 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:12:37.701891 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:12:43.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:43.410802 543705 memory.go:191] Add success.
I0319 12:12:43.409816 543705 cpu.go:282] Add success.
I0319 12:12:43.420514 543705 net.go:648] Add success.
I0319 12:12:43.423368 543705 net.go:770] primary dev: ETH0
I0319 12:12:43.423381 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:12:43.423394 543705 net.go:698] Add success.
I0319 12:12:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:12:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:12:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:12:53.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:12:53.409901 543705 memory.go:184] no items to output this cycle
I0319 12:12:53.410035 543705 cpu.go:275] no items to output this cycle
E0319 12:13:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:03.409772 543705 memory.go:184] no items to output this cycle
I0319 12:13:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 12:13:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:13.409803 543705 memory.go:191] Add success.
I0319 12:13:13.409813 543705 cpu.go:282] Add success.
W0319 12:13:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:13:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:13:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:13:13.420287 543705 net.go:648] Add success.
I0319 12:13:13.423039 543705 net.go:770] primary dev: ETH0
I0319 12:13:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:13:13.423063 543705 net.go:698] Add success.
I0319 12:13:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:13:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:13:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 12:13:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:13:14.456543 543705 disk_worker.go:494] system disk:vda1
I0319 12:13:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:13:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:13:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:13:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:13:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:13:19.069672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:13:19.072070 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:13:19.072077 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaa00 0xc0001aaa40]
E0319 12:13:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:23.409773 543705 memory.go:184] no items to output this cycle
I0319 12:13:23.409778 543705 cpu.go:275] no items to output this cycle
E0319 12:13:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:33.409770 543705 memory.go:184] no items to output this cycle
I0319 12:13:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 12:13:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:43.409813 543705 memory.go:191] Add success.
I0319 12:13:43.409821 543705 cpu.go:282] Add success.
I0319 12:13:43.420420 543705 net.go:648] Add success.
I0319 12:13:43.423160 543705 net.go:770] primary dev: ETH0
I0319 12:13:43.423176 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:13:43.423337 543705 net.go:698] Add success.
I0319 12:13:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:13:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:13:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:13:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:13:53.409797 543705 memory.go:184] no items to output this cycle
I0319 12:13:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:14:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:03.409812 543705 memory.go:184] no items to output this cycle
I0319 12:14:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 12:14:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:13.409789 543705 memory.go:191] Add success.
W0319 12:14:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:14:13.409816 543705 cpu.go:282] Add success.
W0319 12:14:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:14:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:14:13.420285 543705 net.go:648] Add success.
I0319 12:14:13.423141 543705 net.go:770] primary dev: ETH0
I0319 12:14:13.423155 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:14:13.423170 543705 net.go:698] Add success.
I0319 12:14:14.454948 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:14:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:14:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 12:14:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:14:14.456485 543705 disk_worker.go:494] system disk:vda1
I0319 12:14:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:14:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:14:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:14:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:14:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:14:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:14:19.073677 543705 disk_info.go:125] begin check local disk info of client
I0319 12:14:19.076027 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:14:19.076033 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486300 0xc000486340]
E0319 12:14:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:23.409789 543705 memory.go:184] no items to output this cycle
I0319 12:14:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 12:14:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:33.409782 543705 memory.go:184] no items to output this cycle
I0319 12:14:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:14:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:43.409780 543705 memory.go:191] Add success.
I0319 12:14:43.409794 543705 cpu.go:282] Add success.
I0319 12:14:43.420045 543705 net.go:648] Add success.
I0319 12:14:43.421017 543705 net.go:770] primary dev: ETH0
I0319 12:14:43.421029 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:14:43.421041 543705 net.go:698] Add success.
I0319 12:14:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:14:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:14:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:14:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:14:53.409786 543705 memory.go:184] no items to output this cycle
I0319 12:14:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 12:15:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:15:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:15:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:13.409779 543705 memory.go:191] Add success.
I0319 12:15:13.409802 543705 cpu.go:282] Add success.
W0319 12:15:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:15:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:15:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:15:13.420259 543705 net.go:648] Add success.
I0319 12:15:13.422941 543705 net.go:770] primary dev: ETH0
I0319 12:15:13.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:15:13.422965 543705 net.go:698] Add success.
I0319 12:15:13.463555 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cf0b45c5-f413-4f71-bf12-20d9ee1207e7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:15:13.463598 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:15:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:15:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:15:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 12:15:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:15:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 12:15:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:15:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:15:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:15:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:15:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:15:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:15:19.077671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:15:19.080094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:15:19.080101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ef080 0xc0001ef0c0]
E0319 12:15:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:23.409787 543705 memory.go:184] no items to output this cycle
I0319 12:15:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:15:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:33.409791 543705 memory.go:184] no items to output this cycle
I0319 12:15:33.409818 543705 cpu.go:275] no items to output this cycle
I0319 12:15:37.702031 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:15:37.702039 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:15:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:43.410726 543705 memory.go:191] Add success.
I0319 12:15:43.409793 543705 cpu.go:282] Add success.
I0319 12:15:43.420601 543705 net.go:648] Add success.
I0319 12:15:43.424590 543705 net.go:770] primary dev: ETH0
I0319 12:15:43.424603 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:15:43.424615 543705 net.go:698] Add success.
I0319 12:15:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:15:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:15:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:15:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:15:53.409769 543705 memory.go:184] no items to output this cycle
I0319 12:15:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 12:16:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:03.409814 543705 memory.go:184] no items to output this cycle
I0319 12:16:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 12:16:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:13.409796 543705 cpu.go:282] Add success.
I0319 12:16:13.409802 543705 memory.go:191] Add success.
W0319 12:16:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:16:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:16:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:16:13.420182 543705 net.go:648] Add success.
I0319 12:16:13.423215 543705 net.go:770] primary dev: ETH0
I0319 12:16:13.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:16:13.423242 543705 net.go:698] Add success.
I0319 12:16:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:16:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:16:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 12:16:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:16:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 12:16:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:16:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:16:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:16:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:16:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:16:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:16:19.081671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:16:19.084049 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:16:19.084055 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003de900 0xc0003de940]
E0319 12:16:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:23.409795 543705 memory.go:184] no items to output this cycle
I0319 12:16:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:16:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:33.409786 543705 memory.go:184] no items to output this cycle
I0319 12:16:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 12:16:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:43.409789 543705 memory.go:191] Add success.
I0319 12:16:43.409793 543705 cpu.go:282] Add success.
I0319 12:16:43.420189 543705 net.go:648] Add success.
I0319 12:16:43.423136 543705 net.go:770] primary dev: ETH0
I0319 12:16:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:16:43.423173 543705 net.go:698] Add success.
I0319 12:16:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:16:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:16:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:16:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:16:53.409778 543705 memory.go:184] no items to output this cycle
I0319 12:16:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 12:17:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:03.409810 543705 memory.go:184] no items to output this cycle
I0319 12:17:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 12:17:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:13.409816 543705 memory.go:191] Add success.
I0319 12:17:13.409828 543705 cpu.go:282] Add success.
W0319 12:17:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:17:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:17:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:17:13.420153 543705 net.go:648] Add success.
I0319 12:17:13.422958 543705 net.go:770] primary dev: ETH0
I0319 12:17:13.422972 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:17:13.422983 543705 net.go:698] Add success.
I0319 12:17:13.453521 543705 event_worker.go:152] Polling the log file for events...
W0319 12:17:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:17:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 12:17:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:17:14.455921 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:17:14.455930 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:17:14.455936 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:17:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 12:17:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:17:15.456798 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:17:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:17:16.457896 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:17:16.457895 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:17:16.457947 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:17:16.457966 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:17:16.472285 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:17:19.085669 543705 disk_info.go:125] begin check local disk info of client
I0319 12:17:19.088004 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:17:19.088012 543705 disk_info.go:196] parse disk info done, disk is : [0xc000251b80 0xc000251bc0]
E0319 12:17:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:23.409779 543705 memory.go:184] no items to output this cycle
I0319 12:17:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 12:17:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:33.409765 543705 memory.go:184] no items to output this cycle
I0319 12:17:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:17:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:43.409803 543705 memory.go:191] Add success.
I0319 12:17:43.409805 543705 cpu.go:282] Add success.
I0319 12:17:43.419732 543705 net.go:648] Add success.
I0319 12:17:43.422459 543705 net.go:770] primary dev: ETH0
I0319 12:17:43.422473 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:17:43.422484 543705 net.go:698] Add success.
I0319 12:17:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:17:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:17:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:17:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:17:53.409781 543705 memory.go:184] no items to output this cycle
I0319 12:17:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 12:18:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:03.409783 543705 memory.go:184] no items to output this cycle
I0319 12:18:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:18:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:13.409784 543705 memory.go:191] Add success.
W0319 12:18:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:18:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:18:13.409822 543705 cpu.go:282] Add success.
I0319 12:18:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:18:13.420162 543705 net.go:648] Add success.
I0319 12:18:13.422829 543705 net.go:770] primary dev: ETH0
I0319 12:18:13.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:18:13.422860 543705 net.go:698] Add success.
I0319 12:18:13.464071 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2eed5528-3ce8-46a5-b6c1-d62d67137e12","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:18:13.464107 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:18:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:18:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 12:18:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:18:14.456595 543705 disk_worker.go:494] system disk:vda1
I0319 12:18:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:18:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:18:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:18:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:18:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:18:16.472470 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:18:19.089668 543705 disk_info.go:125] begin check local disk info of client
I0319 12:18:19.092134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:18:19.092141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ef080 0xc0003ef0c0]
E0319 12:18:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:23.409766 543705 memory.go:184] no items to output this cycle
I0319 12:18:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 12:18:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:33.409813 543705 memory.go:184] no items to output this cycle
I0319 12:18:33.409826 543705 cpu.go:275] no items to output this cycle
I0319 12:18:37.702195 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:18:37.702201 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:18:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:43.410799 543705 memory.go:191] Add success.
I0319 12:18:43.409802 543705 cpu.go:282] Add success.
I0319 12:18:43.420564 543705 net.go:648] Add success.
I0319 12:18:43.423559 543705 net.go:770] primary dev: ETH0
I0319 12:18:43.423574 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:18:43.423589 543705 net.go:698] Add success.
I0319 12:18:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:18:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:18:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:18:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:18:53.409776 543705 memory.go:184] no items to output this cycle
I0319 12:18:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 12:19:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:03.409780 543705 memory.go:184] no items to output this cycle
I0319 12:19:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 12:19:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:13.409820 543705 memory.go:191] Add success.
I0319 12:19:13.409830 543705 cpu.go:282] Add success.
W0319 12:19:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:19:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:19:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:19:13.420368 543705 net.go:648] Add success.
I0319 12:19:13.423412 543705 net.go:770] primary dev: ETH0
I0319 12:19:13.423426 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:19:13.423441 543705 net.go:698] Add success.
I0319 12:19:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:19:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:19:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 12:19:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:19:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 12:19:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:19:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:19:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:19:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:19:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:19:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:19:19.093673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:19:19.096061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:19:19.096067 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003deb80 0xc0003debc0]
E0319 12:19:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:23.409772 543705 memory.go:184] no items to output this cycle
I0319 12:19:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 12:19:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:33.409771 543705 memory.go:184] no items to output this cycle
I0319 12:19:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 12:19:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:43.409793 543705 memory.go:191] Add success.
I0319 12:19:43.409825 543705 cpu.go:282] Add success.
I0319 12:19:43.420037 543705 net.go:648] Add success.
I0319 12:19:43.422967 543705 net.go:770] primary dev: ETH0
I0319 12:19:43.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:19:43.422991 543705 net.go:698] Add success.
I0319 12:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:19:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:19:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:19:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:19:53.409794 543705 memory.go:184] no items to output this cycle
I0319 12:19:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:20:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:20:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:20:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:13.409829 543705 memory.go:191] Add success.
I0319 12:20:13.409842 543705 cpu.go:282] Add success.
W0319 12:20:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:20:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:20:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:20:13.420223 543705 net.go:648] Add success.
I0319 12:20:13.423230 543705 net.go:770] primary dev: ETH0
I0319 12:20:13.423245 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:20:13.423259 543705 net.go:698] Add success.
I0319 12:20:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:20:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:20:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 12:20:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:20:14.456616 543705 disk_worker.go:494] system disk:vda1
I0319 12:20:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:20:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:20:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:20:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:20:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:20:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:20:19.097671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:20:19.100061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:20:19.100066 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003cfb00 0xc0003cfb40]
E0319 12:20:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:23.409797 543705 memory.go:184] no items to output this cycle
I0319 12:20:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:20:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:33.409790 543705 memory.go:184] no items to output this cycle
I0319 12:20:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:20:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:43.409817 543705 memory.go:191] Add success.
I0319 12:20:43.409825 543705 cpu.go:282] Add success.
I0319 12:20:43.420007 543705 net.go:648] Add success.
I0319 12:20:43.422882 543705 net.go:770] primary dev: ETH0
I0319 12:20:43.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:20:43.422911 543705 net.go:698] Add success.
I0319 12:20:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:20:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:20:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:20:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:20:53.409779 543705 memory.go:184] no items to output this cycle
I0319 12:20:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 12:21:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:03.409778 543705 memory.go:184] no items to output this cycle
I0319 12:21:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 12:21:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:13.409779 543705 memory.go:191] Add success.
W0319 12:21:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:21:13.409810 543705 cpu.go:282] Add success.
W0319 12:21:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:21:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:21:13.419688 543705 net.go:648] Add success.
I0319 12:21:13.422445 543705 net.go:770] primary dev: ETH0
I0319 12:21:13.422458 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:21:13.422472 543705 net.go:698] Add success.
I0319 12:21:13.463426 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9ae01966-2b32-45b4-ba46-6d6a1faa0f27","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:21:13.463460 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:21:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:21:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:21:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 12:21:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:21:14.456629 543705 disk_worker.go:494] system disk:vda1
I0319 12:21:14.456660 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:21:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:21:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:21:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:21:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:21:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:21:19.101673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:21:19.104072 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:21:19.104079 543705 disk_info.go:196] parse disk info done, disk is : [0xc000382d80 0xc000382dc0]
E0319 12:21:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:23.409793 543705 memory.go:184] no items to output this cycle
I0319 12:21:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:21:33.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:33.409913 543705 cpu.go:275] no items to output this cycle
I0319 12:21:33.409987 543705 memory.go:184] no items to output this cycle
I0319 12:21:37.702335 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:21:37.702343 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:21:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:43.410699 543705 memory.go:191] Add success.
I0319 12:21:43.409829 543705 cpu.go:282] Add success.
I0319 12:21:43.420483 543705 net.go:648] Add success.
I0319 12:21:43.423280 543705 net.go:770] primary dev: ETH0
I0319 12:21:43.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:21:43.423312 543705 net.go:698] Add success.
I0319 12:21:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:21:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:21:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:21:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:21:53.409782 543705 memory.go:184] no items to output this cycle
I0319 12:21:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:22:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:03.409809 543705 memory.go:184] no items to output this cycle
I0319 12:22:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 12:22:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:13.409777 543705 memory.go:191] Add success.
I0319 12:22:13.409797 543705 cpu.go:282] Add success.
W0319 12:22:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:22:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:22:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:22:13.420145 543705 net.go:648] Add success.
I0319 12:22:13.422959 543705 net.go:770] primary dev: ETH0
I0319 12:22:13.422974 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:22:13.422989 543705 net.go:698] Add success.
W0319 12:22:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:22:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 12:22:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:22:14.455875 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:22:14.455884 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:22:14.455889 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:22:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 12:22:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:22:15.456844 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:22:15.456852 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:22:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:22:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:22:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:22:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:22:16.472339 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:22:19.105673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:22:19.108033 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:22:19.108039 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272400 0xc000272440]
E0319 12:22:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:23.409794 543705 memory.go:184] no items to output this cycle
I0319 12:22:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 12:22:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:33.409910 543705 memory.go:184] no items to output this cycle
I0319 12:22:33.409964 543705 cpu.go:275] no items to output this cycle
E0319 12:22:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:43.409796 543705 cpu.go:282] Add success.
I0319 12:22:43.409803 543705 memory.go:191] Add success.
I0319 12:22:43.419908 543705 net.go:648] Add success.
I0319 12:22:43.422685 543705 net.go:770] primary dev: ETH0
I0319 12:22:43.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:22:43.422709 543705 net.go:698] Add success.
I0319 12:22:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:22:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:22:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:22:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:22:53.409768 543705 memory.go:184] no items to output this cycle
I0319 12:22:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:23:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:03.409784 543705 memory.go:184] no items to output this cycle
I0319 12:23:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 12:23:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:13.409814 543705 memory.go:191] Add success.
I0319 12:23:13.409814 543705 cpu.go:282] Add success.
W0319 12:23:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:23:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:23:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:23:13.420134 543705 net.go:648] Add success.
I0319 12:23:13.423177 543705 net.go:770] primary dev: ETH0
I0319 12:23:13.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:23:13.423208 543705 net.go:698] Add success.
I0319 12:23:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:23:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:23:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 12:23:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:23:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 12:23:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:23:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:23:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:23:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:23:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:23:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:23:19.109670 543705 disk_info.go:125] begin check local disk info of client
I0319 12:23:19.111961 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:23:19.111967 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035bb80 0xc00035bbc0]
E0319 12:23:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:23.409769 543705 memory.go:184] no items to output this cycle
I0319 12:23:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:23:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:33.409774 543705 memory.go:184] no items to output this cycle
I0319 12:23:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:23:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:43.409828 543705 memory.go:191] Add success.
I0319 12:23:43.409838 543705 cpu.go:282] Add success.
I0319 12:23:43.419979 543705 net.go:648] Add success.
I0319 12:23:43.422941 543705 net.go:770] primary dev: ETH0
I0319 12:23:43.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:23:43.422975 543705 net.go:698] Add success.
I0319 12:23:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:23:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:23:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:23:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:23:53.409799 543705 memory.go:184] no items to output this cycle
I0319 12:23:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 12:24:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:03.409783 543705 memory.go:184] no items to output this cycle
I0319 12:24:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 12:24:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:13.409790 543705 memory.go:191] Add success.
I0319 12:24:13.409808 543705 cpu.go:282] Add success.
W0319 12:24:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:24:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:24:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:24:13.420061 543705 net.go:648] Add success.
I0319 12:24:13.422931 543705 net.go:770] primary dev: ETH0
I0319 12:24:13.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:24:13.422959 543705 net.go:698] Add success.
I0319 12:24:13.480209 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"966e98ff-eff3-4044-8d68-53b1fbe217bd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:24:13.480244 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:24:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:24:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:24:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 12:24:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:24:14.456636 543705 disk_worker.go:494] system disk:vda1
I0319 12:24:14.456664 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:24:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:24:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:24:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:24:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:24:19.113671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:24:19.116058 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:24:19.116064 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252c00 0xc000252c40]
E0319 12:24:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:23.409794 543705 memory.go:184] no items to output this cycle
I0319 12:24:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:24:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:33.409782 543705 memory.go:184] no items to output this cycle
I0319 12:24:33.409795 543705 cpu.go:275] no items to output this cycle
I0319 12:24:37.703221 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:24:37.703227 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:24:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:43.410809 543705 memory.go:191] Add success.
I0319 12:24:43.409805 543705 cpu.go:282] Add success.
I0319 12:24:43.420500 543705 net.go:648] Add success.
I0319 12:24:43.423349 543705 net.go:770] primary dev: ETH0
I0319 12:24:43.423365 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:24:43.423380 543705 net.go:698] Add success.
I0319 12:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:24:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:24:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:24:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:24:53.409799 543705 memory.go:184] no items to output this cycle
I0319 12:24:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 12:25:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:03.409794 543705 memory.go:184] no items to output this cycle
I0319 12:25:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 12:25:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:13.409790 543705 memory.go:191] Add success.
I0319 12:25:13.409811 543705 cpu.go:282] Add success.
W0319 12:25:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:25:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:25:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:25:13.420168 543705 net.go:648] Add success.
I0319 12:25:13.422975 543705 net.go:770] primary dev: ETH0
I0319 12:25:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:25:13.423003 543705 net.go:698] Add success.
I0319 12:25:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:25:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:25:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 12:25:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:25:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 12:25:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:25:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:25:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:25:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:25:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:25:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:25:19.117669 543705 disk_info.go:125] begin check local disk info of client
I0319 12:25:19.120011 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:25:19.120017 543705 disk_info.go:196] parse disk info done, disk is : [0xc000307000 0xc000307040]
E0319 12:25:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:23.409801 543705 memory.go:184] no items to output this cycle
I0319 12:25:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 12:25:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:33.409784 543705 memory.go:184] no items to output this cycle
I0319 12:25:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:25:43.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:43.409920 543705 memory.go:191] Add success.
I0319 12:25:43.409932 543705 cpu.go:282] Add success.
I0319 12:25:43.419712 543705 net.go:648] Add success.
I0319 12:25:43.422479 543705 net.go:770] primary dev: ETH0
I0319 12:25:43.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:25:43.422503 543705 net.go:698] Add success.
I0319 12:25:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:25:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:25:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:25:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:25:53.409774 543705 memory.go:184] no items to output this cycle
I0319 12:25:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:26:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:03.409812 543705 memory.go:184] no items to output this cycle
I0319 12:26:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 12:26:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:13.409772 543705 memory.go:191] Add success.
W0319 12:26:13.409797 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:26:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:26:13.409809 543705 cpu.go:282] Add success.
I0319 12:26:13.409810 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:26:13.420112 543705 net.go:648] Add success.
I0319 12:26:13.422828 543705 net.go:770] primary dev: ETH0
I0319 12:26:13.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:26:13.422856 543705 net.go:698] Add success.
I0319 12:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:26:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:26:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0319 12:26:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:26:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 12:26:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:26:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:26:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:26:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:26:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:26:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:26:19.121673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:26:19.123988 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:26:19.123994 543705 disk_info.go:196] parse disk info done, disk is : [0xc000306d80 0xc000306dc0]
E0319 12:26:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:23.409795 543705 memory.go:184] no items to output this cycle
I0319 12:26:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:26:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:33.409809 543705 memory.go:184] no items to output this cycle
I0319 12:26:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 12:26:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:43.409887 543705 cpu.go:282] Add success.
I0319 12:26:43.409894 543705 memory.go:191] Add success.
I0319 12:26:43.419732 543705 net.go:648] Add success.
I0319 12:26:43.422985 543705 net.go:770] primary dev: ETH0
I0319 12:26:43.422999 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:26:43.423012 543705 net.go:698] Add success.
I0319 12:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:26:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:26:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:26:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:26:53.409768 543705 memory.go:184] no items to output this cycle
I0319 12:26:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 12:27:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:27:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 12:27:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:13.409781 543705 memory.go:191] Add success.
I0319 12:27:13.409803 543705 cpu.go:282] Add success.
W0319 12:27:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:27:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:27:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:27:13.420176 543705 net.go:648] Add success.
I0319 12:27:13.423281 543705 net.go:770] primary dev: ETH0
I0319 12:27:13.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:27:13.423307 543705 net.go:698] Add success.
I0319 12:27:13.429702 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 12:27:13.452938 543705 event_worker.go:152] Polling the log file for events...
I0319 12:27:13.468361 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5be117a5-59be-4a36-8e53-058b9a411dba","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:27:13.468394 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 12:27:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:27:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 12:27:14.455166 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:27:14.456967 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:27:14.456976 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:27:14.456981 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:27:14.456982 543705 disk_worker.go:494] system disk:vda1
I0319 12:27:14.457011 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:27:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:27:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:27:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:27:16.457956 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:27:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:27:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:27:16.472353 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:27:19.125671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:27:19.128005 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:27:19.128011 543705 disk_info.go:196] parse disk info done, disk is : [0xc000508d00 0xc000508d40]
E0319 12:27:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:23.409771 543705 memory.go:184] no items to output this cycle
I0319 12:27:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 12:27:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:33.409801 543705 memory.go:184] no items to output this cycle
I0319 12:27:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 12:27:37.704229 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:27:37.704236 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:27:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:43.410806 543705 memory.go:191] Add success.
I0319 12:27:43.409791 543705 cpu.go:282] Add success.
I0319 12:27:43.419771 543705 net.go:648] Add success.
I0319 12:27:43.422660 543705 net.go:770] primary dev: ETH0
I0319 12:27:43.422686 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:27:43.422701 543705 net.go:698] Add success.
I0319 12:27:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:27:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:27:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:27:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:27:53.409785 543705 memory.go:184] no items to output this cycle
I0319 12:27:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:28:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:03.409800 543705 memory.go:184] no items to output this cycle
I0319 12:28:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:28:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:13.409827 543705 memory.go:191] Add success.
I0319 12:28:13.409835 543705 cpu.go:282] Add success.
W0319 12:28:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:28:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:28:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:28:13.420134 543705 net.go:648] Add success.
I0319 12:28:13.423046 543705 net.go:770] primary dev: ETH0
I0319 12:28:13.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:28:13.423076 543705 net.go:698] Add success.
I0319 12:28:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:28:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:28:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 12:28:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:28:14.456489 543705 disk_worker.go:494] system disk:vda1
I0319 12:28:14.456532 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:28:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:28:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:28:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:28:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:28:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:28:19.129672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:28:19.132031 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:28:19.132037 543705 disk_info.go:196] parse disk info done, disk is : [0xc000481740 0xc000481780]
E0319 12:28:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:23.409805 543705 memory.go:184] no items to output this cycle
I0319 12:28:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 12:28:33.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:33.409821 543705 memory.go:184] no items to output this cycle
I0319 12:28:33.409835 543705 cpu.go:275] no items to output this cycle
E0319 12:28:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:43.409786 543705 memory.go:191] Add success.
I0319 12:28:43.409811 543705 cpu.go:282] Add success.
I0319 12:28:43.420190 543705 net.go:648] Add success.
I0319 12:28:43.423092 543705 net.go:770] primary dev: ETH0
I0319 12:28:43.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:28:43.423116 543705 net.go:698] Add success.
I0319 12:28:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:28:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:28:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:28:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:28:53.409811 543705 memory.go:184] no items to output this cycle
I0319 12:28:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 12:29:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:03.409787 543705 memory.go:184] no items to output this cycle
I0319 12:29:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:29:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:13.409824 543705 memory.go:191] Add success.
I0319 12:29:13.409834 543705 cpu.go:282] Add success.
W0319 12:29:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:29:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:29:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:29:13.420139 543705 net.go:648] Add success.
I0319 12:29:13.423037 543705 net.go:770] primary dev: ETH0
I0319 12:29:13.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:29:13.423062 543705 net.go:698] Add success.
I0319 12:29:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:29:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:29:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 12:29:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:29:14.456564 543705 disk_worker.go:494] system disk:vda1
I0319 12:29:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:29:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:29:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:29:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:29:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:29:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:29:19.133672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:29:19.136014 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:29:19.136021 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003978c0 0xc000397c40]
E0319 12:29:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:23.409777 543705 memory.go:184] no items to output this cycle
I0319 12:29:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 12:29:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:33.409779 543705 memory.go:184] no items to output this cycle
I0319 12:29:33.409781 543705 cpu.go:275] no items to output this cycle
E0319 12:29:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:43.409787 543705 memory.go:191] Add success.
I0319 12:29:43.409803 543705 cpu.go:282] Add success.
I0319 12:29:43.419891 543705 net.go:648] Add success.
I0319 12:29:43.423187 543705 net.go:770] primary dev: ETH0
I0319 12:29:43.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:29:43.423220 543705 net.go:698] Add success.
I0319 12:29:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:29:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:29:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:29:53.410238 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:29:53.410255 543705 memory.go:184] no items to output this cycle
I0319 12:29:53.410285 543705 cpu.go:275] no items to output this cycle
E0319 12:30:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:03.409799 543705 memory.go:184] no items to output this cycle
I0319 12:30:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 12:30:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:13.409781 543705 memory.go:191] Add success.
W0319 12:30:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:30:13.409810 543705 cpu.go:282] Add success.
W0319 12:30:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:30:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:30:13.420162 543705 net.go:648] Add success.
I0319 12:30:13.423081 543705 net.go:770] primary dev: ETH0
I0319 12:30:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:30:13.423107 543705 net.go:698] Add success.
I0319 12:30:13.518421 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2ba94a5f-a500-49e0-9ad9-951c6b9de644","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:30:13.518454 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:30:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:30:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:30:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 12:30:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:30:14.456614 543705 disk_worker.go:494] system disk:vda1
I0319 12:30:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:30:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:30:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:30:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:30:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:30:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:30:19.137672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:30:19.140034 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:30:19.140040 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abe00 0xc0001abe40]
E0319 12:30:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:23.409788 543705 memory.go:184] no items to output this cycle
I0319 12:30:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:30:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:33.409811 543705 memory.go:184] no items to output this cycle
I0319 12:30:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 12:30:37.705219 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:30:37.705226 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:30:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:43.411020 543705 memory.go:191] Add success.
I0319 12:30:43.409814 543705 cpu.go:282] Add success.
I0319 12:30:43.419698 543705 net.go:648] Add success.
I0319 12:30:43.422451 543705 net.go:770] primary dev: ETH0
I0319 12:30:43.422465 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:30:43.422479 543705 net.go:698] Add success.
I0319 12:30:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:30:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:30:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:30:53.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:30:53.409892 543705 memory.go:184] no items to output this cycle
I0319 12:30:53.410029 543705 cpu.go:275] no items to output this cycle
E0319 12:31:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:03.409773 543705 memory.go:184] no items to output this cycle
I0319 12:31:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:31:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:13.409775 543705 memory.go:191] Add success.
W0319 12:31:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:31:13.409808 543705 cpu.go:282] Add success.
W0319 12:31:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:31:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:31:13.420171 543705 net.go:648] Add success.
I0319 12:31:13.422912 543705 net.go:770] primary dev: ETH0
I0319 12:31:13.422926 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:31:13.422949 543705 net.go:698] Add success.
I0319 12:31:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:31:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:31:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 12:31:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:31:14.456499 543705 disk_worker.go:494] system disk:vda1
I0319 12:31:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:31:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:31:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:31:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:31:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:31:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:31:19.141674 543705 disk_info.go:125] begin check local disk info of client
I0319 12:31:19.144041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:31:19.144048 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0319 12:31:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:23.409764 543705 memory.go:184] no items to output this cycle
I0319 12:31:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 12:31:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:33.409779 543705 memory.go:184] no items to output this cycle
I0319 12:31:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 12:31:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:43.409813 543705 memory.go:191] Add success.
I0319 12:31:43.409822 543705 cpu.go:282] Add success.
I0319 12:31:43.419970 543705 net.go:648] Add success.
I0319 12:31:43.423258 543705 net.go:770] primary dev: ETH0
I0319 12:31:43.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:31:43.423283 543705 net.go:698] Add success.
I0319 12:31:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:31:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:31:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:31:53.410464 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:31:53.410579 543705 cpu.go:275] no items to output this cycle
I0319 12:31:53.410591 543705 memory.go:184] no items to output this cycle
E0319 12:32:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:03.409813 543705 memory.go:184] no items to output this cycle
I0319 12:32:03.409830 543705 cpu.go:275] no items to output this cycle
E0319 12:32:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:13.409813 543705 memory.go:191] Add success.
I0319 12:32:13.409829 543705 cpu.go:282] Add success.
W0319 12:32:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:32:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:32:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:32:13.420198 543705 net.go:648] Add success.
I0319 12:32:13.422853 543705 net.go:770] primary dev: ETH0
I0319 12:32:13.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:32:13.422878 543705 net.go:698] Add success.
W0319 12:32:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:32:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 12:32:14.455165 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:32:14.456899 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:32:14.456908 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:32:14.456915 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:32:14.456996 543705 disk_worker.go:494] system disk:vda1
I0319 12:32:14.457050 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:32:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:32:15.456816 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:32:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:32:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:32:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:32:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:32:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:32:19.145671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:32:19.148047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:32:19.148053 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee380 0xc0003ee3c0]
E0319 12:32:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:23.409771 543705 memory.go:184] no items to output this cycle
I0319 12:32:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 12:32:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:33.409810 543705 memory.go:184] no items to output this cycle
I0319 12:32:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 12:32:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:43.409775 543705 memory.go:191] Add success.
I0319 12:32:43.409814 543705 cpu.go:282] Add success.
I0319 12:32:43.419848 543705 net.go:648] Add success.
I0319 12:32:43.422669 543705 net.go:770] primary dev: ETH0
I0319 12:32:43.422683 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:32:43.422696 543705 net.go:698] Add success.
I0319 12:32:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:32:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:32:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:32:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:32:53.409773 543705 memory.go:184] no items to output this cycle
I0319 12:32:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:33:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:03.409793 543705 memory.go:184] no items to output this cycle
I0319 12:33:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 12:33:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:13.409813 543705 memory.go:191] Add success.
I0319 12:33:13.409817 543705 cpu.go:282] Add success.
W0319 12:33:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:33:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:33:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:33:13.420248 543705 net.go:648] Add success.
I0319 12:33:13.423032 543705 net.go:770] primary dev: ETH0
I0319 12:33:13.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:33:13.423057 543705 net.go:698] Add success.
I0319 12:33:13.471691 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9ea0ee75-6b42-4099-abf7-a204be223434","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:33:13.471724 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:33:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:33:14.455216 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:33:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0319 12:33:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:33:14.456634 543705 disk_worker.go:494] system disk:vda1
I0319 12:33:14.456665 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:33:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:33:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:33:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:33:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:33:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:33:19.149672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:33:19.152130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:33:19.152136 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ad40 0xc00007ad80]
E0319 12:33:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:23.409765 543705 memory.go:184] no items to output this cycle
I0319 12:33:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 12:33:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:33.409785 543705 memory.go:184] no items to output this cycle
I0319 12:33:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 12:33:37.705730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:33:37.705736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:33:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:43.410699 543705 memory.go:191] Add success.
I0319 12:33:43.409791 543705 cpu.go:282] Add success.
I0319 12:33:43.420395 543705 net.go:648] Add success.
I0319 12:33:43.423057 543705 net.go:770] primary dev: ETH0
I0319 12:33:43.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:33:43.423096 543705 net.go:698] Add success.
I0319 12:33:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:33:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:33:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:33:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:33:53.409776 543705 memory.go:184] no items to output this cycle
I0319 12:33:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 12:34:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:03.409788 543705 memory.go:184] no items to output this cycle
I0319 12:34:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:34:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:13.409816 543705 memory.go:191] Add success.
I0319 12:34:13.409819 543705 cpu.go:282] Add success.
W0319 12:34:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:34:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:34:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:34:13.420163 543705 net.go:648] Add success.
I0319 12:34:13.423282 543705 net.go:770] primary dev: ETH0
I0319 12:34:13.423297 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:34:13.423312 543705 net.go:698] Add success.
I0319 12:34:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:34:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:34:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 12:34:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:34:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 12:34:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:34:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:34:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:34:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:34:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:34:19.153671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:34:19.156042 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:34:19.156048 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007afc0 0xc00007b000]
E0319 12:34:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:23.409758 543705 memory.go:184] no items to output this cycle
I0319 12:34:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 12:34:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:33.409782 543705 memory.go:184] no items to output this cycle
I0319 12:34:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:34:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:43.409808 543705 memory.go:191] Add success.
I0319 12:34:43.409816 543705 cpu.go:282] Add success.
I0319 12:34:43.419948 543705 net.go:648] Add success.
I0319 12:34:43.422804 543705 net.go:770] primary dev: ETH0
I0319 12:34:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:34:43.422830 543705 net.go:698] Add success.
I0319 12:34:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:34:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:34:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:34:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:34:53.409805 543705 memory.go:184] no items to output this cycle
I0319 12:34:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 12:35:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:03.409808 543705 memory.go:184] no items to output this cycle
I0319 12:35:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 12:35:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:13.409813 543705 memory.go:191] Add success.
I0319 12:35:13.409822 543705 cpu.go:282] Add success.
W0319 12:35:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:35:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:35:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:35:13.420179 543705 net.go:648] Add success.
I0319 12:35:13.423051 543705 net.go:770] primary dev: ETH0
I0319 12:35:13.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:35:13.423076 543705 net.go:698] Add success.
I0319 12:35:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:35:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:35:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 12:35:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:35:14.456588 543705 disk_worker.go:494] system disk:vda1
I0319 12:35:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:35:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:35:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:35:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:35:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:35:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:35:19.157673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:35:19.160036 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:35:19.160042 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ab40 0xc00007ab80]
E0319 12:35:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:23.409774 543705 memory.go:184] no items to output this cycle
I0319 12:35:23.409774 543705 cpu.go:275] no items to output this cycle
E0319 12:35:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:33.409770 543705 memory.go:184] no items to output this cycle
I0319 12:35:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:35:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:43.409792 543705 memory.go:191] Add success.
I0319 12:35:43.409793 543705 cpu.go:282] Add success.
I0319 12:35:43.419984 543705 net.go:648] Add success.
I0319 12:35:43.423133 543705 net.go:770] primary dev: ETH0
I0319 12:35:43.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:35:43.423160 543705 net.go:698] Add success.
I0319 12:35:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:35:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:35:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:35:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:35:53.409902 543705 cpu.go:275] no items to output this cycle
I0319 12:35:53.409905 543705 memory.go:184] no items to output this cycle
E0319 12:36:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:03.409795 543705 memory.go:184] no items to output this cycle
I0319 12:36:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 12:36:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:13.409816 543705 memory.go:191] Add success.
I0319 12:36:13.409834 543705 cpu.go:282] Add success.
W0319 12:36:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:36:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:36:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:36:13.420371 543705 net.go:648] Add success.
I0319 12:36:13.423552 543705 net.go:770] primary dev: ETH0
I0319 12:36:13.423565 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:36:13.423578 543705 net.go:698] Add success.
I0319 12:36:13.463962 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7ec9f1ae-96a9-4401-8c45-0ffd2de2eaf0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:36:13.463995 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:36:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:36:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:36:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 12:36:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:36:14.456521 543705 disk_worker.go:494] system disk:vda1
I0319 12:36:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:36:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:36:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:36:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:36:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:36:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:36:19.161692 543705 disk_info.go:125] begin check local disk info of client
I0319 12:36:19.164055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:36:19.164061 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003efbc0 0xc0003efc00]
E0319 12:36:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:23.409798 543705 memory.go:184] no items to output this cycle
I0319 12:36:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:36:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:33.409780 543705 memory.go:184] no items to output this cycle
I0319 12:36:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 12:36:37.707233 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:36:37.707239 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:36:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:43.410871 543705 memory.go:191] Add success.
I0319 12:36:43.410002 543705 cpu.go:282] Add success.
I0319 12:36:43.419734 543705 net.go:648] Add success.
I0319 12:36:43.422614 543705 net.go:770] primary dev: ETH0
I0319 12:36:43.422628 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:36:43.422642 543705 net.go:698] Add success.
I0319 12:36:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:36:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:36:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:36:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:36:53.409785 543705 cpu.go:275] no items to output this cycle
I0319 12:36:53.409794 543705 memory.go:184] no items to output this cycle
E0319 12:37:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:03.409793 543705 cpu.go:275] no items to output this cycle
I0319 12:37:03.409797 543705 memory.go:184] no items to output this cycle
W0319 12:37:13.409701 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:37:13.409717 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:37:13.409722 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 12:37:13.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:13.409816 543705 memory.go:191] Add success.
I0319 12:37:13.409826 543705 cpu.go:282] Add success.
I0319 12:37:13.420105 543705 net.go:648] Add success.
I0319 12:37:13.422937 543705 net.go:770] primary dev: ETH0
I0319 12:37:13.422950 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:37:13.422962 543705 net.go:698] Add success.
I0319 12:37:13.453539 543705 event_worker.go:152] Polling the log file for events...
W0319 12:37:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:37:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 12:37:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:37:14.455889 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:37:14.455898 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:37:14.455904 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:37:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 12:37:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:37:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:37:15.456828 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:37:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:37:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:37:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:37:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:37:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:37:19.165674 543705 disk_info.go:125] begin check local disk info of client
I0319 12:37:19.168010 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:37:19.168016 543705 disk_info.go:196] parse disk info done, disk is : [0xc000267380 0xc0002673c0]
E0319 12:37:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:23.409900 543705 cpu.go:275] no items to output this cycle
I0319 12:37:23.409905 543705 memory.go:184] no items to output this cycle
E0319 12:37:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:33.409784 543705 memory.go:184] no items to output this cycle
I0319 12:37:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 12:37:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:43.409785 543705 memory.go:191] Add success.
I0319 12:37:43.409817 543705 cpu.go:282] Add success.
I0319 12:37:43.419886 543705 net.go:648] Add success.
I0319 12:37:43.422746 543705 net.go:770] primary dev: ETH0
I0319 12:37:43.422761 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:37:43.422774 543705 net.go:698] Add success.
I0319 12:37:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:37:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:37:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:37:53.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:37:53.409821 543705 memory.go:184] no items to output this cycle
I0319 12:37:53.409831 543705 cpu.go:275] no items to output this cycle
E0319 12:38:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:03.409791 543705 memory.go:184] no items to output this cycle
I0319 12:38:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 12:38:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:13.409786 543705 memory.go:191] Add success.
W0319 12:38:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:38:13.409821 543705 cpu.go:282] Add success.
W0319 12:38:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:38:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:38:13.420231 543705 net.go:648] Add success.
I0319 12:38:13.423148 543705 net.go:770] primary dev: ETH0
I0319 12:38:13.423161 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:38:13.423173 543705 net.go:698] Add success.
I0319 12:38:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:38:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:38:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 12:38:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:38:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 12:38:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:38:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:38:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:38:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:38:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:38:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:38:19.169674 543705 disk_info.go:125] begin check local disk info of client
I0319 12:38:19.172045 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:38:19.172051 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6140 0xc0003b6180]
E0319 12:38:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:23.409790 543705 memory.go:184] no items to output this cycle
I0319 12:38:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:38:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:33.409792 543705 memory.go:184] no items to output this cycle
I0319 12:38:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:38:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:43.409810 543705 memory.go:191] Add success.
I0319 12:38:43.409815 543705 cpu.go:282] Add success.
I0319 12:38:43.419884 543705 net.go:648] Add success.
I0319 12:38:43.422901 543705 net.go:770] primary dev: ETH0
I0319 12:38:43.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:38:43.422930 543705 net.go:698] Add success.
I0319 12:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:38:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:38:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:38:53.410352 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:38:53.410368 543705 memory.go:184] no items to output this cycle
I0319 12:38:53.410375 543705 cpu.go:275] no items to output this cycle
E0319 12:39:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:39:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 12:39:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:13.409778 543705 memory.go:191] Add success.
W0319 12:39:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:39:13.409811 543705 cpu.go:282] Add success.
W0319 12:39:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:39:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:39:13.420104 543705 net.go:648] Add success.
I0319 12:39:13.423242 543705 net.go:770] primary dev: ETH0
I0319 12:39:13.423255 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:39:13.423267 543705 net.go:698] Add success.
I0319 12:39:13.470626 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"17b329d2-faef-4784-96d3-c16b8f8daa07","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:39:13.470660 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:39:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:39:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:39:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 12:39:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:39:14.456592 543705 disk_worker.go:494] system disk:vda1
I0319 12:39:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:39:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:39:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:39:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:39:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:39:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:39:19.173672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:39:19.176051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:39:19.176057 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f64c0 0xc0004f6500]
E0319 12:39:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:23.409795 543705 memory.go:184] no items to output this cycle
I0319 12:39:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:39:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:33.409794 543705 memory.go:184] no items to output this cycle
I0319 12:39:33.409808 543705 cpu.go:275] no items to output this cycle
I0319 12:39:37.708248 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:39:37.708256 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:39:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:43.410777 543705 memory.go:191] Add success.
I0319 12:39:43.409815 543705 cpu.go:282] Add success.
I0319 12:39:43.420505 543705 net.go:648] Add success.
I0319 12:39:43.423150 543705 net.go:770] primary dev: ETH0
I0319 12:39:43.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:39:43.423175 543705 net.go:698] Add success.
I0319 12:39:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:39:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:39:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:39:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:39:53.409771 543705 memory.go:184] no items to output this cycle
I0319 12:39:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 12:40:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:03.409774 543705 memory.go:184] no items to output this cycle
I0319 12:40:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:40:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:13.409792 543705 memory.go:191] Add success.
I0319 12:40:13.409797 543705 cpu.go:282] Add success.
W0319 12:40:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:40:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:40:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:40:13.420045 543705 net.go:648] Add success.
I0319 12:40:13.422873 543705 net.go:770] primary dev: ETH0
I0319 12:40:13.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:40:13.422898 543705 net.go:698] Add success.
I0319 12:40:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:40:14.455325 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:40:14.455427 543705 disk_worker.go:708] disk space is not compliant
W0319 12:40:14.455437 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:40:14.457053 543705 disk_worker.go:494] system disk:vda1
I0319 12:40:14.457081 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:40:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:40:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:40:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:40:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:40:16.472464 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:40:19.177672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:40:19.180039 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:40:19.180045 543705 disk_info.go:196] parse disk info done, disk is : [0xc000374380 0xc0003743c0]
E0319 12:40:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:23.409765 543705 memory.go:184] no items to output this cycle
I0319 12:40:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 12:40:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:33.409783 543705 memory.go:184] no items to output this cycle
I0319 12:40:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:40:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:43.409789 543705 memory.go:191] Add success.
I0319 12:40:43.409795 543705 cpu.go:282] Add success.
I0319 12:40:43.419847 543705 net.go:648] Add success.
I0319 12:40:43.422680 543705 net.go:770] primary dev: ETH0
I0319 12:40:43.422693 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:40:43.422706 543705 net.go:698] Add success.
I0319 12:40:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:40:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:40:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:40:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:40:53.409811 543705 memory.go:184] no items to output this cycle
I0319 12:40:53.409819 543705 cpu.go:275] no items to output this cycle
E0319 12:41:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:03.409813 543705 memory.go:184] no items to output this cycle
I0319 12:41:03.409827 543705 cpu.go:275] no items to output this cycle
W0319 12:41:13.409717 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:41:13.409735 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:41:13.409741 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:41:13.409801 543705 cpu.go:282] Add success.
E0319 12:41:13.409843 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:13.409864 543705 memory.go:191] Add success.
I0319 12:41:13.420038 543705 net.go:648] Add success.
I0319 12:41:13.423060 543705 net.go:770] primary dev: ETH0
I0319 12:41:13.423075 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:41:13.423087 543705 net.go:698] Add success.
I0319 12:41:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:41:14.455470 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:41:14.455484 543705 disk_worker.go:708] disk space is not compliant
W0319 12:41:14.455493 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:41:14.457064 543705 disk_worker.go:494] system disk:vda1
I0319 12:41:14.457092 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:41:15.454976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:41:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:41:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:41:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:41:19.181671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:41:19.184057 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:41:19.184063 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e100 0xc00034e140]
E0319 12:41:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:23.409784 543705 memory.go:184] no items to output this cycle
I0319 12:41:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 12:41:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:33.409811 543705 memory.go:184] no items to output this cycle
I0319 12:41:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 12:41:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:43.409788 543705 memory.go:191] Add success.
I0319 12:41:43.409813 543705 cpu.go:282] Add success.
I0319 12:41:43.419885 543705 net.go:648] Add success.
I0319 12:41:43.423206 543705 net.go:770] primary dev: ETH0
I0319 12:41:43.423219 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:41:43.423233 543705 net.go:698] Add success.
I0319 12:41:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:41:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:41:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:41:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:41:53.409778 543705 memory.go:184] no items to output this cycle
I0319 12:41:53.409777 543705 cpu.go:275] no items to output this cycle
E0319 12:42:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:03.409786 543705 memory.go:184] no items to output this cycle
I0319 12:42:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 12:42:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:13.409808 543705 memory.go:191] Add success.
I0319 12:42:13.409816 543705 cpu.go:282] Add success.
W0319 12:42:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:42:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:42:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:42:13.420219 543705 net.go:648] Add success.
I0319 12:42:13.423088 543705 net.go:770] primary dev: ETH0
I0319 12:42:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:42:13.423115 543705 net.go:698] Add success.
I0319 12:42:13.470004 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e95a1050-3383-4a60-a185-ad44431bb964","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:42:13.470035 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 12:42:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:42:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0319 12:42:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:42:14.456931 543705 disk_worker.go:494] system disk:vda1
I0319 12:42:14.456971 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:42:14.457001 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:42:14.457010 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:42:14.457014 543705 custom_config.go:64] query custom config with name: gpu
E0319 12:42:15.456427 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:42:15.456436 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:42:16.457915 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:42:16.457916 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:42:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:42:16.457990 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:42:16.472320 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:42:19.185672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:42:19.187984 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:42:19.187990 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492500 0xc000492540]
E0319 12:42:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:23.409790 543705 memory.go:184] no items to output this cycle
I0319 12:42:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 12:42:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:33.409783 543705 memory.go:184] no items to output this cycle
I0319 12:42:33.409791 543705 cpu.go:275] no items to output this cycle
I0319 12:42:37.709240 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:42:37.709247 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:42:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:43.410701 543705 memory.go:191] Add success.
I0319 12:42:43.409822 543705 cpu.go:282] Add success.
I0319 12:42:43.420480 543705 net.go:648] Add success.
I0319 12:42:43.423251 543705 net.go:770] primary dev: ETH0
I0319 12:42:43.423264 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:42:43.423276 543705 net.go:698] Add success.
I0319 12:42:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:42:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:42:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:42:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:42:53.409801 543705 memory.go:184] no items to output this cycle
I0319 12:42:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:43:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:03.409794 543705 memory.go:184] no items to output this cycle
I0319 12:43:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:43:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:13.409823 543705 memory.go:191] Add success.
I0319 12:43:13.409828 543705 cpu.go:282] Add success.
W0319 12:43:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:43:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:43:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:43:13.419887 543705 net.go:770] primary dev: ETH0
I0319 12:43:13.419899 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:43:13.419911 543705 net.go:698] Add success.
I0319 12:43:13.420464 543705 net.go:648] Add success.
I0319 12:43:14.453936 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:43:14.455222 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:43:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0319 12:43:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:43:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 12:43:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:43:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:43:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:43:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:43:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:43:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:43:19.189679 543705 disk_info.go:125] begin check local disk info of client
I0319 12:43:19.192069 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:43:19.192075 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394440 0xc000394480]
E0319 12:43:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:23.409784 543705 memory.go:184] no items to output this cycle
I0319 12:43:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:43:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:33.409787 543705 memory.go:184] no items to output this cycle
I0319 12:43:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 12:43:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:43.409826 543705 memory.go:191] Add success.
I0319 12:43:43.409839 543705 cpu.go:282] Add success.
I0319 12:43:43.419998 543705 net.go:648] Add success.
I0319 12:43:43.422673 543705 net.go:770] primary dev: ETH0
I0319 12:43:43.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:43:43.422709 543705 net.go:698] Add success.
I0319 12:43:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:43:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:43:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:43:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:43:53.409808 543705 memory.go:184] no items to output this cycle
I0319 12:43:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 12:44:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:03.409783 543705 memory.go:184] no items to output this cycle
I0319 12:44:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 12:44:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:13.409801 543705 memory.go:191] Add success.
I0319 12:44:13.409806 543705 cpu.go:282] Add success.
W0319 12:44:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:44:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:44:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:44:13.419704 543705 net.go:648] Add success.
I0319 12:44:13.422606 543705 net.go:770] primary dev: ETH0
I0319 12:44:13.422619 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:44:13.422629 543705 net.go:698] Add success.
I0319 12:44:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:44:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:44:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 12:44:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:44:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 12:44:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:44:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:44:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:44:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:44:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:44:16.472361 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:44:19.193673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:44:19.196113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:44:19.196121 543705 disk_info.go:196] parse disk info done, disk is : [0xc000275a80 0xc000275ac0]
E0319 12:44:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:23.409770 543705 memory.go:184] no items to output this cycle
I0319 12:44:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 12:44:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:33.409794 543705 memory.go:184] no items to output this cycle
I0319 12:44:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 12:44:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:43.409829 543705 memory.go:191] Add success.
I0319 12:44:43.409840 543705 cpu.go:282] Add success.
I0319 12:44:43.419993 543705 net.go:648] Add success.
I0319 12:44:43.422825 543705 net.go:770] primary dev: ETH0
I0319 12:44:43.422839 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:44:43.422851 543705 net.go:698] Add success.
I0319 12:44:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:44:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:44:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:44:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:44:53.409787 543705 memory.go:184] no items to output this cycle
I0319 12:44:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:45:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:03.409785 543705 memory.go:184] no items to output this cycle
I0319 12:45:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 12:45:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:13.409808 543705 memory.go:191] Add success.
I0319 12:45:13.409808 543705 cpu.go:282] Add success.
W0319 12:45:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:45:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:45:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:45:13.420125 543705 net.go:648] Add success.
I0319 12:45:13.423034 543705 net.go:770] primary dev: ETH0
I0319 12:45:13.423047 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:45:13.423058 543705 net.go:698] Add success.
I0319 12:45:13.464193 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7766f1ad-446f-4202-ad64-be3112d36738","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:45:13.464225 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:45:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:45:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:45:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0319 12:45:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:45:14.456832 543705 disk_worker.go:494] system disk:vda1
I0319 12:45:14.456865 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:45:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:45:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:45:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:45:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:45:16.472446 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:45:19.197671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:45:19.200028 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:45:19.200034 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7c40 0xc0003b7c80]
E0319 12:45:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:23.409811 543705 memory.go:184] no items to output this cycle
I0319 12:45:23.409822 543705 cpu.go:275] no items to output this cycle
E0319 12:45:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:33.409778 543705 memory.go:184] no items to output this cycle
I0319 12:45:33.409780 543705 cpu.go:275] no items to output this cycle
I0319 12:45:37.709737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:45:37.709744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:45:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:43.410638 543705 memory.go:191] Add success.
I0319 12:45:43.409824 543705 cpu.go:282] Add success.
I0319 12:45:43.420364 543705 net.go:648] Add success.
I0319 12:45:43.423039 543705 net.go:770] primary dev: ETH0
I0319 12:45:43.423052 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:45:43.423065 543705 net.go:698] Add success.
I0319 12:45:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:45:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:45:46.458051 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:45:53.409784 543705 memory.go:184] no items to output this cycle
I0319 12:45:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 12:46:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:03.409796 543705 memory.go:184] no items to output this cycle
I0319 12:46:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:46:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:13.409788 543705 memory.go:191] Add success.
I0319 12:46:13.409810 543705 cpu.go:282] Add success.
W0319 12:46:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:46:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:46:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:46:13.420206 543705 net.go:648] Add success.
I0319 12:46:13.423194 543705 net.go:770] primary dev: ETH0
I0319 12:46:13.423207 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:46:13.423219 543705 net.go:698] Add success.
I0319 12:46:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:46:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:46:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 12:46:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:46:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 12:46:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:46:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:46:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:46:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:46:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:46:19.201677 543705 disk_info.go:125] begin check local disk info of client
I0319 12:46:19.204076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:46:19.204082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e7940 0xc0001e7980]
E0319 12:46:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:23.409818 543705 memory.go:184] no items to output this cycle
I0319 12:46:23.409831 543705 cpu.go:275] no items to output this cycle
E0319 12:46:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:33.409815 543705 memory.go:184] no items to output this cycle
I0319 12:46:33.409828 543705 cpu.go:275] no items to output this cycle
E0319 12:46:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:43.409785 543705 memory.go:191] Add success.
I0319 12:46:43.409815 543705 cpu.go:282] Add success.
I0319 12:46:43.419931 543705 net.go:648] Add success.
I0319 12:46:43.422729 543705 net.go:770] primary dev: ETH0
I0319 12:46:43.422743 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:46:43.422757 543705 net.go:698] Add success.
I0319 12:46:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:46:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:46:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:46:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:46:53.409799 543705 memory.go:184] no items to output this cycle
I0319 12:46:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:47:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:03.409788 543705 memory.go:184] no items to output this cycle
I0319 12:47:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 12:47:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:13.409800 543705 memory.go:191] Add success.
I0319 12:47:13.409811 543705 cpu.go:282] Add success.
W0319 12:47:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:47:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:47:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:47:13.420132 543705 net.go:648] Add success.
I0319 12:47:13.423367 543705 net.go:770] primary dev: ETH0
I0319 12:47:13.423380 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:47:13.423392 543705 net.go:698] Add success.
I0319 12:47:13.452986 543705 event_worker.go:152] Polling the log file for events...
W0319 12:47:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:47:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 12:47:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:47:14.456957 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:47:14.456965 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:47:14.456971 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:47:14.457029 543705 disk_worker.go:494] system disk:vda1
I0319 12:47:14.457058 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:47:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:47:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:47:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:47:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:47:16.457984 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:47:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:47:16.472335 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:47:19.205672 543705 disk_info.go:125] begin check local disk info of client
I0319 12:47:19.208036 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:47:19.208041 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ba00 0xc00007ba40]
E0319 12:47:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:23.409767 543705 memory.go:184] no items to output this cycle
I0319 12:47:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 12:47:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:33.409779 543705 memory.go:184] no items to output this cycle
I0319 12:47:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 12:47:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:43.409780 543705 memory.go:191] Add success.
I0319 12:47:43.409815 543705 cpu.go:282] Add success.
I0319 12:47:43.419875 543705 net.go:648] Add success.
I0319 12:47:43.423053 543705 net.go:770] primary dev: ETH0
I0319 12:47:43.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:47:43.423080 543705 net.go:698] Add success.
I0319 12:47:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:47:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:47:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:47:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:47:53.409805 543705 memory.go:184] no items to output this cycle
I0319 12:47:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 12:48:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:48:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:48:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:13.409825 543705 memory.go:191] Add success.
I0319 12:48:13.409829 543705 cpu.go:282] Add success.
W0319 12:48:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:48:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:48:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:48:13.420133 543705 net.go:648] Add success.
I0319 12:48:13.423695 543705 net.go:770] primary dev: ETH0
I0319 12:48:13.423708 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:48:13.423720 543705 net.go:698] Add success.
I0319 12:48:13.545566 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"74a6a192-4434-4307-95af-0c769298058a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:48:13.545600 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:48:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:48:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:48:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 12:48:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:48:14.456487 543705 disk_worker.go:494] system disk:vda1
I0319 12:48:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:48:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:48:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:48:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:48:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:48:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:48:19.209671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:48:19.212085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:48:19.212091 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5c80 0xc0000c5cc0]
E0319 12:48:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:23.409779 543705 memory.go:184] no items to output this cycle
I0319 12:48:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 12:48:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:33.409814 543705 memory.go:184] no items to output this cycle
I0319 12:48:33.409822 543705 cpu.go:275] no items to output this cycle
I0319 12:48:37.711250 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:48:37.711257 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:48:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:43.410751 543705 memory.go:191] Add success.
I0319 12:48:43.409823 543705 cpu.go:282] Add success.
I0319 12:48:43.420462 543705 net.go:648] Add success.
I0319 12:48:43.423431 543705 net.go:770] primary dev: ETH0
I0319 12:48:43.423448 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:48:43.423463 543705 net.go:698] Add success.
I0319 12:48:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:48:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:48:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:48:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:48:53.409763 543705 memory.go:184] no items to output this cycle
I0319 12:48:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 12:49:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:03.409775 543705 memory.go:184] no items to output this cycle
I0319 12:49:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:49:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:13.409794 543705 memory.go:191] Add success.
I0319 12:49:13.409805 543705 cpu.go:282] Add success.
W0319 12:49:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:49:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:49:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:49:13.420267 543705 net.go:648] Add success.
I0319 12:49:13.423304 543705 net.go:770] primary dev: ETH0
I0319 12:49:13.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:49:13.423329 543705 net.go:698] Add success.
I0319 12:49:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:49:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:49:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 12:49:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:49:14.456572 543705 disk_worker.go:494] system disk:vda1
I0319 12:49:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:49:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:49:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:49:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:49:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:49:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:49:19.213675 543705 disk_info.go:125] begin check local disk info of client
I0319 12:49:19.216257 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:49:19.216263 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8040 0xc0004a8080]
E0319 12:49:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:23.409790 543705 memory.go:184] no items to output this cycle
I0319 12:49:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 12:49:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:33.409769 543705 memory.go:184] no items to output this cycle
I0319 12:49:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:49:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:43.409798 543705 memory.go:191] Add success.
I0319 12:49:43.409799 543705 cpu.go:282] Add success.
I0319 12:49:43.419877 543705 net.go:648] Add success.
I0319 12:49:43.422966 543705 net.go:770] primary dev: ETH0
I0319 12:49:43.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:49:43.422990 543705 net.go:698] Add success.
I0319 12:49:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:49:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:49:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:49:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:49:53.409766 543705 memory.go:184] no items to output this cycle
I0319 12:49:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 12:50:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:03.409811 543705 memory.go:184] no items to output this cycle
I0319 12:50:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 12:50:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:13.409811 543705 memory.go:191] Add success.
I0319 12:50:13.409815 543705 cpu.go:282] Add success.
W0319 12:50:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:50:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:50:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:50:13.420127 543705 net.go:648] Add success.
I0319 12:50:13.422939 543705 net.go:770] primary dev: ETH0
I0319 12:50:13.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:50:13.422971 543705 net.go:698] Add success.
I0319 12:50:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:50:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:50:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 12:50:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:50:14.456550 543705 disk_worker.go:494] system disk:vda1
I0319 12:50:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:50:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:50:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:50:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:50:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:50:19.217670 543705 disk_info.go:125] begin check local disk info of client
I0319 12:50:19.220029 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:50:19.220034 543705 disk_info.go:196] parse disk info done, disk is : [0xc000382440 0xc000382480]
E0319 12:50:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:23.409796 543705 memory.go:184] no items to output this cycle
I0319 12:50:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:50:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:33.409807 543705 memory.go:184] no items to output this cycle
I0319 12:50:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 12:50:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:43.409785 543705 memory.go:191] Add success.
I0319 12:50:43.409816 543705 cpu.go:282] Add success.
I0319 12:50:43.419888 543705 net.go:648] Add success.
I0319 12:50:43.422763 543705 net.go:770] primary dev: ETH0
I0319 12:50:43.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:50:43.422794 543705 net.go:698] Add success.
I0319 12:50:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:50:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:50:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:50:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:50:53.409777 543705 memory.go:184] no items to output this cycle
I0319 12:50:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 12:51:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:03.409775 543705 memory.go:184] no items to output this cycle
I0319 12:51:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:51:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:13.409813 543705 memory.go:191] Add success.
I0319 12:51:13.409818 543705 cpu.go:282] Add success.
W0319 12:51:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:51:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:51:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:51:13.420112 543705 net.go:648] Add success.
I0319 12:51:13.422976 543705 net.go:770] primary dev: ETH0
I0319 12:51:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:51:13.423001 543705 net.go:698] Add success.
I0319 12:51:13.491205 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"49c6f160-7c57-4f51-a000-ef2366f77c31","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:51:13.491246 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:51:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:51:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:51:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 12:51:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:51:14.456528 543705 disk_worker.go:494] system disk:vda1
I0319 12:51:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:51:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:51:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:51:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:51:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:51:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:51:19.221673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:51:19.224061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:51:19.224067 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002565c0 0xc000256600]
E0319 12:51:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:23.409766 543705 memory.go:184] no items to output this cycle
I0319 12:51:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 12:51:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:33.409797 543705 memory.go:184] no items to output this cycle
I0319 12:51:33.409812 543705 cpu.go:275] no items to output this cycle
I0319 12:51:37.712259 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:51:37.712268 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:51:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:43.410833 543705 memory.go:191] Add success.
I0319 12:51:43.409801 543705 cpu.go:282] Add success.
I0319 12:51:43.420513 543705 net.go:648] Add success.
I0319 12:51:43.423493 543705 net.go:770] primary dev: ETH0
I0319 12:51:43.423511 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:51:43.423524 543705 net.go:698] Add success.
I0319 12:51:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:51:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:51:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:51:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:51:53.409777 543705 memory.go:184] no items to output this cycle
I0319 12:51:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 12:52:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:03.409808 543705 memory.go:184] no items to output this cycle
I0319 12:52:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 12:52:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:13.409812 543705 memory.go:191] Add success.
I0319 12:52:13.409824 543705 cpu.go:282] Add success.
W0319 12:52:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:52:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:52:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:52:13.420107 543705 net.go:648] Add success.
I0319 12:52:13.422849 543705 net.go:770] primary dev: ETH0
I0319 12:52:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:52:13.422879 543705 net.go:698] Add success.
W0319 12:52:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:52:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 12:52:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:52:14.456932 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:52:14.456941 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:52:14.456948 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:52:14.456999 543705 disk_worker.go:494] system disk:vda1
I0319 12:52:14.457044 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:52:15.456821 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:52:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:52:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:52:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:52:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:52:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:52:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:52:19.225680 543705 disk_info.go:125] begin check local disk info of client
I0319 12:52:19.227999 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:52:19.228005 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b8300 0xc0003b8340]
E0319 12:52:23.410218 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:23.410237 543705 memory.go:184] no items to output this cycle
I0319 12:52:23.410248 543705 cpu.go:275] no items to output this cycle
E0319 12:52:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:33.409815 543705 memory.go:184] no items to output this cycle
I0319 12:52:33.409826 543705 cpu.go:275] no items to output this cycle
E0319 12:52:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:43.409818 543705 memory.go:191] Add success.
I0319 12:52:43.409829 543705 cpu.go:282] Add success.
I0319 12:52:43.419966 543705 net.go:648] Add success.
I0319 12:52:43.423283 543705 net.go:770] primary dev: ETH0
I0319 12:52:43.423297 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:52:43.423309 543705 net.go:698] Add success.
I0319 12:52:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:52:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:52:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:52:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:52:53.409771 543705 memory.go:184] no items to output this cycle
I0319 12:52:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 12:53:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:03.409813 543705 memory.go:184] no items to output this cycle
I0319 12:53:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 12:53:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:13.409792 543705 memory.go:191] Add success.
W0319 12:53:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:53:13.409822 543705 cpu.go:282] Add success.
W0319 12:53:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:53:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:53:13.420135 543705 net.go:648] Add success.
I0319 12:53:13.423166 543705 net.go:770] primary dev: ETH0
I0319 12:53:13.423181 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:53:13.423192 543705 net.go:698] Add success.
I0319 12:53:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:53:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:53:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 12:53:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:53:14.456561 543705 disk_worker.go:494] system disk:vda1
I0319 12:53:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:53:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:53:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:53:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:53:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:53:19.229664 543705 disk_info.go:125] begin check local disk info of client
I0319 12:53:19.232074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:53:19.232080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003aa600 0xc0003aa640]
E0319 12:53:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:23.409778 543705 memory.go:184] no items to output this cycle
I0319 12:53:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:53:33.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:33.409828 543705 memory.go:184] no items to output this cycle
I0319 12:53:33.409841 543705 cpu.go:275] no items to output this cycle
E0319 12:53:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:43.409828 543705 memory.go:191] Add success.
I0319 12:53:43.409839 543705 cpu.go:282] Add success.
I0319 12:53:43.419932 543705 net.go:648] Add success.
I0319 12:53:43.422836 543705 net.go:770] primary dev: ETH0
I0319 12:53:43.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:53:43.422861 543705 net.go:698] Add success.
I0319 12:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:53:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:53:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:53:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:53:53.409807 543705 memory.go:184] no items to output this cycle
I0319 12:53:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 12:54:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:03.409789 543705 memory.go:184] no items to output this cycle
I0319 12:54:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 12:54:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:13.409797 543705 memory.go:191] Add success.
I0319 12:54:13.409799 543705 cpu.go:282] Add success.
W0319 12:54:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:54:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:54:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:54:13.420146 543705 net.go:648] Add success.
I0319 12:54:13.422973 543705 net.go:770] primary dev: ETH0
I0319 12:54:13.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:54:13.422998 543705 net.go:698] Add success.
I0319 12:54:13.469687 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"72e6cbf2-87ea-4f9a-ab0c-8344e2bd336c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:54:13.469721 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 12:54:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:54:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:54:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 12:54:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:54:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 12:54:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:54:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:54:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:54:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:54:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:54:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:54:19.233673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:54:19.236048 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:54:19.236055 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9240 0xc0004a9280]
E0319 12:54:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:23.409801 543705 memory.go:184] no items to output this cycle
I0319 12:54:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 12:54:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:33.409893 543705 cpu.go:275] no items to output this cycle
I0319 12:54:33.409905 543705 memory.go:184] no items to output this cycle
I0319 12:54:37.713264 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:54:37.713271 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:54:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:43.410539 543705 memory.go:191] Add success.
I0319 12:54:43.409817 543705 cpu.go:282] Add success.
I0319 12:54:43.420314 543705 net.go:648] Add success.
I0319 12:54:43.422965 543705 net.go:770] primary dev: ETH0
I0319 12:54:43.422980 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:54:43.422995 543705 net.go:698] Add success.
I0319 12:54:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:54:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:54:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:54:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:54:53.409802 543705 memory.go:184] no items to output this cycle
I0319 12:54:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:55:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:03.409784 543705 memory.go:184] no items to output this cycle
I0319 12:55:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 12:55:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:13.409817 543705 memory.go:191] Add success.
I0319 12:55:13.409825 543705 cpu.go:282] Add success.
W0319 12:55:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:55:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:55:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:55:13.420272 543705 net.go:648] Add success.
I0319 12:55:13.423188 543705 net.go:770] primary dev: ETH0
I0319 12:55:13.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:55:13.423214 543705 net.go:698] Add success.
I0319 12:55:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:55:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:55:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 12:55:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:55:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 12:55:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:55:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:55:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:55:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:55:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:55:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:55:19.237673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:55:19.240051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:55:19.240056 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003816c0 0xc000381700]
E0319 12:55:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:23.409776 543705 memory.go:184] no items to output this cycle
I0319 12:55:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 12:55:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:33.409790 543705 memory.go:184] no items to output this cycle
I0319 12:55:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 12:55:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:43.409801 543705 memory.go:191] Add success.
I0319 12:55:43.409803 543705 cpu.go:282] Add success.
I0319 12:55:43.420016 543705 net.go:648] Add success.
I0319 12:55:43.422998 543705 net.go:770] primary dev: ETH0
I0319 12:55:43.423011 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:55:43.423023 543705 net.go:698] Add success.
I0319 12:55:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:55:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:55:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:55:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:55:53.409769 543705 memory.go:184] no items to output this cycle
I0319 12:55:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 12:56:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:03.409788 543705 memory.go:184] no items to output this cycle
I0319 12:56:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 12:56:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:13.409816 543705 memory.go:191] Add success.
I0319 12:56:13.409824 543705 cpu.go:282] Add success.
W0319 12:56:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:56:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:56:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:56:13.420167 543705 net.go:648] Add success.
I0319 12:56:13.422818 543705 net.go:770] primary dev: ETH0
I0319 12:56:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:56:13.422843 543705 net.go:698] Add success.
I0319 12:56:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:56:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:56:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 12:56:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:56:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 12:56:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:56:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:56:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:56:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:56:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:56:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:56:19.241675 543705 disk_info.go:125] begin check local disk info of client
I0319 12:56:19.244030 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:56:19.244035 543705 disk_info.go:196] parse disk info done, disk is : [0xc000517340 0xc000517380]
E0319 12:56:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:23.409798 543705 memory.go:184] no items to output this cycle
I0319 12:56:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:56:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:33.409809 543705 memory.go:184] no items to output this cycle
I0319 12:56:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 12:56:43.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:43.409959 543705 memory.go:191] Add success.
I0319 12:56:43.410087 543705 cpu.go:282] Add success.
I0319 12:56:43.419730 543705 net.go:648] Add success.
I0319 12:56:43.422737 543705 net.go:770] primary dev: ETH0
I0319 12:56:43.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:56:43.422762 543705 net.go:698] Add success.
I0319 12:56:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:56:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:56:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:56:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:56:53.409765 543705 memory.go:184] no items to output this cycle
I0319 12:56:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 12:57:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:03.409795 543705 memory.go:184] no items to output this cycle
I0319 12:57:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 12:57:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:13.409778 543705 memory.go:191] Add success.
W0319 12:57:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 12:57:13.409809 543705 cpu.go:282] Add success.
W0319 12:57:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:57:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:57:13.420142 543705 net.go:648] Add success.
I0319 12:57:13.423081 543705 net.go:770] primary dev: ETH0
I0319 12:57:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:57:13.423105 543705 net.go:698] Add success.
I0319 12:57:13.429709 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 12:57:13.452892 543705 event_worker.go:152] Polling the log file for events...
I0319 12:57:13.463584 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"07898ae3-4395-4887-b39f-ae705e7ed561","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 12:57:13.463619 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 12:57:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:57:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 12:57:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 12:57:14.456971 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 12:57:14.456991 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 12:57:14.456997 543705 custom_config.go:64] query custom config with name: gpu
I0319 12:57:14.457019 543705 disk_worker.go:494] system disk:vda1
I0319 12:57:14.457059 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 12:57:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 12:57:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:57:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 12:57:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 12:57:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:57:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:57:16.472360 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:57:19.245673 543705 disk_info.go:125] begin check local disk info of client
I0319 12:57:19.248018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:57:19.248024 543705 disk_info.go:196] parse disk info done, disk is : [0xc000292640 0xc000292680]
E0319 12:57:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:23.409787 543705 memory.go:184] no items to output this cycle
I0319 12:57:23.409802 543705 cpu.go:275] no items to output this cycle
I0319 12:57:33.409880 543705 cpu.go:275] no items to output this cycle
E0319 12:57:33.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:33.409900 543705 memory.go:184] no items to output this cycle
I0319 12:57:37.713733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 12:57:37.713740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 12:57:43.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:43.410623 543705 memory.go:191] Add success.
I0319 12:57:43.409813 543705 cpu.go:282] Add success.
I0319 12:57:43.420415 543705 net.go:648] Add success.
I0319 12:57:43.422981 543705 net.go:770] primary dev: ETH0
I0319 12:57:43.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:57:43.423006 543705 net.go:698] Add success.
I0319 12:57:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:57:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:57:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:57:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:57:53.409794 543705 memory.go:184] no items to output this cycle
I0319 12:57:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 12:58:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:03.409795 543705 memory.go:184] no items to output this cycle
I0319 12:58:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 12:58:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:13.409801 543705 memory.go:191] Add success.
I0319 12:58:13.409802 543705 cpu.go:282] Add success.
W0319 12:58:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:58:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:58:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:58:13.420150 543705 net.go:648] Add success.
I0319 12:58:13.422873 543705 net.go:770] primary dev: ETH0
I0319 12:58:13.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:58:13.422901 543705 net.go:698] Add success.
I0319 12:58:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:58:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:58:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 12:58:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:58:14.456601 543705 disk_worker.go:494] system disk:vda1
I0319 12:58:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:58:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:58:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:58:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:58:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:58:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:58:19.249671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:58:19.252059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:58:19.252065 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396780 0xc0003967c0]
E0319 12:58:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:23.409787 543705 memory.go:184] no items to output this cycle
I0319 12:58:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 12:58:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:33.409810 543705 memory.go:184] no items to output this cycle
I0319 12:58:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 12:58:43.409897 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:43.409926 543705 memory.go:191] Add success.
I0319 12:58:43.409949 543705 cpu.go:282] Add success.
I0319 12:58:43.419742 543705 net.go:648] Add success.
I0319 12:58:43.422371 543705 net.go:770] primary dev: ETH0
I0319 12:58:43.422385 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:58:43.422399 543705 net.go:698] Add success.
I0319 12:58:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:58:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:58:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:58:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:58:53.409773 543705 memory.go:184] no items to output this cycle
I0319 12:58:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 12:59:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:03.409776 543705 memory.go:184] no items to output this cycle
I0319 12:59:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 12:59:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:13.409795 543705 memory.go:191] Add success.
I0319 12:59:13.409802 543705 cpu.go:282] Add success.
W0319 12:59:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 12:59:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 12:59:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 12:59:13.420080 543705 net.go:648] Add success.
I0319 12:59:13.422716 543705 net.go:770] primary dev: ETH0
I0319 12:59:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:59:13.422742 543705 net.go:698] Add success.
I0319 12:59:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 12:59:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 12:59:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 12:59:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0319 12:59:14.456491 543705 disk_worker.go:494] system disk:vda1
I0319 12:59:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 12:59:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 12:59:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:59:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:59:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 12:59:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 12:59:19.253671 543705 disk_info.go:125] begin check local disk info of client
I0319 12:59:19.256096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 12:59:19.256102 543705 disk_info.go:196] parse disk info done, disk is : [0xc000521f00 0xc000521f40]
E0319 12:59:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:23.409803 543705 memory.go:184] no items to output this cycle
I0319 12:59:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 12:59:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:33.409803 543705 memory.go:184] no items to output this cycle
I0319 12:59:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 12:59:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:43.409788 543705 memory.go:191] Add success.
I0319 12:59:43.409813 543705 cpu.go:282] Add success.
I0319 12:59:43.420264 543705 net.go:648] Add success.
I0319 12:59:43.423047 543705 net.go:770] primary dev: ETH0
I0319 12:59:43.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0319 12:59:43.423075 543705 net.go:698] Add success.
I0319 12:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 12:59:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 12:59:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 12:59:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 12:59:53.409786 543705 memory.go:184] no items to output this cycle
I0319 12:59:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 13:00:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:03.409793 543705 memory.go:184] no items to output this cycle
I0319 13:00:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:00:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:13.409789 543705 memory.go:191] Add success.
W0319 13:00:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:00:13.409820 543705 cpu.go:282] Add success.
W0319 13:00:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:00:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:00:13.420185 543705 net.go:648] Add success.
I0319 13:00:13.423055 543705 net.go:770] primary dev: ETH0
I0319 13:00:13.423070 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:00:13.423082 543705 net.go:698] Add success.
I0319 13:00:13.470175 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"acc03c8c-ddeb-4782-ae1c-5fe149b3f274","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:00:13.470208 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:00:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:00:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:00:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 13:00:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:00:14.456489 543705 disk_worker.go:494] system disk:vda1
I0319 13:00:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:00:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:00:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:00:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:00:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:00:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:00:19.257681 543705 disk_info.go:125] begin check local disk info of client
I0319 13:00:19.260047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:00:19.260053 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be800 0xc0002be840]
E0319 13:00:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:23.409799 543705 memory.go:184] no items to output this cycle
I0319 13:00:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:00:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:33.409792 543705 memory.go:184] no items to output this cycle
I0319 13:00:33.409814 543705 cpu.go:275] no items to output this cycle
I0319 13:00:37.713887 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:00:37.713894 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:00:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:43.410716 543705 memory.go:191] Add success.
I0319 13:00:43.409805 543705 cpu.go:282] Add success.
I0319 13:00:43.420622 543705 net.go:648] Add success.
I0319 13:00:43.423178 543705 net.go:770] primary dev: ETH0
I0319 13:00:43.423191 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:00:43.423203 543705 net.go:698] Add success.
I0319 13:00:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:00:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:00:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:00:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:00:53.409785 543705 memory.go:184] no items to output this cycle
I0319 13:00:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 13:01:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:03.409783 543705 memory.go:184] no items to output this cycle
I0319 13:01:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:01:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:13.409808 543705 memory.go:191] Add success.
I0319 13:01:13.409829 543705 cpu.go:282] Add success.
W0319 13:01:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:01:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:01:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:01:13.420275 543705 net.go:648] Add success.
I0319 13:01:13.423414 543705 net.go:770] primary dev: ETH0
I0319 13:01:13.423427 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:01:13.423438 543705 net.go:698] Add success.
I0319 13:01:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:01:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:01:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 13:01:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:01:14.456523 543705 disk_worker.go:494] system disk:vda1
I0319 13:01:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:01:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:01:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:01:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:01:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:01:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:01:19.261675 543705 disk_info.go:125] begin check local disk info of client
I0319 13:01:19.264035 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:01:19.264041 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be9c0 0xc0002bea00]
E0319 13:01:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:23.409803 543705 memory.go:184] no items to output this cycle
I0319 13:01:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 13:01:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:33.409795 543705 memory.go:184] no items to output this cycle
I0319 13:01:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:01:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:43.409834 543705 memory.go:191] Add success.
I0319 13:01:43.409840 543705 cpu.go:282] Add success.
I0319 13:01:43.419765 543705 net.go:648] Add success.
I0319 13:01:43.422391 543705 net.go:770] primary dev: ETH0
I0319 13:01:43.422405 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:01:43.422419 543705 net.go:698] Add success.
I0319 13:01:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:01:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:01:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:01:53.410231 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:01:53.410247 543705 memory.go:184] no items to output this cycle
I0319 13:01:53.410279 543705 cpu.go:275] no items to output this cycle
E0319 13:02:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:03.409774 543705 memory.go:184] no items to output this cycle
I0319 13:02:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 13:02:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:13.409777 543705 memory.go:191] Add success.
W0319 13:02:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:02:13.409820 543705 cpu.go:282] Add success.
W0319 13:02:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:02:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:02:13.420159 543705 net.go:648] Add success.
I0319 13:02:13.423004 543705 net.go:770] primary dev: ETH0
I0319 13:02:13.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:02:13.423033 543705 net.go:698] Add success.
W0319 13:02:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:02:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 13:02:14.455196 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:02:14.455910 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:02:14.455919 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:02:14.455924 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:02:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 13:02:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:02:15.456851 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:02:15.456860 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:02:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:02:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:02:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:02:16.458014 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:02:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:02:19.265671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:02:19.268067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:02:19.268072 543705 disk_info.go:196] parse disk info done, disk is : [0xc000521a40 0xc000521a80]
E0319 13:02:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:23.409786 543705 memory.go:184] no items to output this cycle
I0319 13:02:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 13:02:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:33.409787 543705 memory.go:184] no items to output this cycle
I0319 13:02:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 13:02:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:43.409874 543705 memory.go:191] Add success.
I0319 13:02:43.409955 543705 cpu.go:282] Add success.
I0319 13:02:43.419731 543705 net.go:648] Add success.
I0319 13:02:43.422578 543705 net.go:770] primary dev: ETH0
I0319 13:02:43.422592 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:02:43.422606 543705 net.go:698] Add success.
I0319 13:02:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:02:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:02:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:02:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:02:53.409795 543705 memory.go:184] no items to output this cycle
I0319 13:02:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:03:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:03.409810 543705 memory.go:184] no items to output this cycle
I0319 13:03:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 13:03:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:13.409821 543705 memory.go:191] Add success.
I0319 13:03:13.409830 543705 cpu.go:282] Add success.
W0319 13:03:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:03:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:03:13.409886 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:03:13.420131 543705 net.go:648] Add success.
I0319 13:03:13.422943 543705 net.go:770] primary dev: ETH0
I0319 13:03:13.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:03:13.422970 543705 net.go:698] Add success.
I0319 13:03:13.469183 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9b137a76-8dd6-44ee-a067-a0cf50c8f9b6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:03:13.469215 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:03:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:03:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:03:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 13:03:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:03:14.456583 543705 disk_worker.go:494] system disk:vda1
I0319 13:03:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:03:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:03:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:03:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:03:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:03:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:03:19.269673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:03:19.272089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:03:19.272095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386100 0xc000386140]
E0319 13:03:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:23.409786 543705 memory.go:184] no items to output this cycle
I0319 13:03:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 13:03:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:33.409769 543705 memory.go:184] no items to output this cycle
I0319 13:03:33.409798 543705 cpu.go:275] no items to output this cycle
I0319 13:03:37.715270 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:03:37.715277 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:03:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:43.410753 543705 memory.go:191] Add success.
I0319 13:03:43.409969 543705 cpu.go:282] Add success.
I0319 13:03:43.419716 543705 net.go:648] Add success.
I0319 13:03:43.422606 543705 net.go:770] primary dev: ETH0
I0319 13:03:43.422621 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:03:43.422636 543705 net.go:698] Add success.
I0319 13:03:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:03:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:03:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:03:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:03:53.409804 543705 memory.go:184] no items to output this cycle
I0319 13:03:53.409820 543705 cpu.go:275] no items to output this cycle
E0319 13:04:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:03.409792 543705 cpu.go:275] no items to output this cycle
I0319 13:04:03.409797 543705 memory.go:184] no items to output this cycle
E0319 13:04:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:13.409788 543705 memory.go:191] Add success.
I0319 13:04:13.409790 543705 cpu.go:282] Add success.
W0319 13:04:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:04:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:04:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:04:13.420090 543705 net.go:648] Add success.
I0319 13:04:13.423059 543705 net.go:770] primary dev: ETH0
I0319 13:04:13.423072 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:04:13.423086 543705 net.go:698] Add success.
I0319 13:04:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:04:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:04:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 13:04:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:04:14.456519 543705 disk_worker.go:494] system disk:vda1
I0319 13:04:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:04:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:04:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:04:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:04:16.472361 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:04:19.273673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:04:19.276103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:04:19.276110 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9940 0xc0004a9980]
E0319 13:04:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:23.409774 543705 memory.go:184] no items to output this cycle
I0319 13:04:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 13:04:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:33.409815 543705 memory.go:184] no items to output this cycle
I0319 13:04:33.409826 543705 cpu.go:275] no items to output this cycle
E0319 13:04:43.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:43.409961 543705 memory.go:191] Add success.
I0319 13:04:43.409965 543705 cpu.go:282] Add success.
I0319 13:04:43.419760 543705 net.go:648] Add success.
I0319 13:04:43.422556 543705 net.go:770] primary dev: ETH0
I0319 13:04:43.422571 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:04:43.422584 543705 net.go:698] Add success.
I0319 13:04:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:04:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:04:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:04:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:04:53.409776 543705 memory.go:184] no items to output this cycle
I0319 13:04:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 13:05:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:03.409817 543705 memory.go:184] no items to output this cycle
I0319 13:05:03.409831 543705 cpu.go:275] no items to output this cycle
E0319 13:05:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:13.409777 543705 memory.go:191] Add success.
W0319 13:05:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:05:13.409807 543705 cpu.go:282] Add success.
W0319 13:05:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:05:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:05:13.420067 543705 net.go:648] Add success.
I0319 13:05:13.422831 543705 net.go:770] primary dev: ETH0
I0319 13:05:13.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:05:13.422855 543705 net.go:698] Add success.
I0319 13:05:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:05:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:05:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 13:05:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:05:14.456582 543705 disk_worker.go:494] system disk:vda1
I0319 13:05:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:05:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:05:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:05:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:05:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:05:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:05:19.277673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:05:19.280048 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:05:19.280055 543705 disk_info.go:196] parse disk info done, disk is : [0xc000521600 0xc000521640]
E0319 13:05:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:23.409804 543705 memory.go:184] no items to output this cycle
I0319 13:05:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 13:05:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:33.409782 543705 memory.go:184] no items to output this cycle
I0319 13:05:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 13:05:43.409960 543705 cpu.go:282] Add success.
E0319 13:05:43.409906 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:43.410074 543705 memory.go:191] Add success.
I0319 13:05:43.419708 543705 net.go:648] Add success.
I0319 13:05:43.422285 543705 net.go:770] primary dev: ETH0
I0319 13:05:43.422298 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:05:43.422310 543705 net.go:698] Add success.
I0319 13:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:05:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:05:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:05:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:05:53.409768 543705 memory.go:184] no items to output this cycle
I0319 13:05:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:06:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:03.409783 543705 memory.go:184] no items to output this cycle
I0319 13:06:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 13:06:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:13.409782 543705 memory.go:191] Add success.
W0319 13:06:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:06:13.409814 543705 cpu.go:282] Add success.
W0319 13:06:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:06:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:06:13.420156 543705 net.go:648] Add success.
I0319 13:06:13.422696 543705 net.go:770] primary dev: ETH0
I0319 13:06:13.422711 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:06:13.422725 543705 net.go:698] Add success.
I0319 13:06:13.565265 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f78e7bd2-c4e7-4585-986d-723e5a845070","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:06:13.565298 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:06:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:06:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:06:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 13:06:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:06:14.456742 543705 disk_worker.go:494] system disk:vda1
I0319 13:06:14.456771 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:06:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:06:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:06:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:06:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:06:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:06:19.281678 543705 disk_info.go:125] begin check local disk info of client
I0319 13:06:19.284018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:06:19.284028 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bff40 0xc00032a000]
E0319 13:06:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:23.409781 543705 memory.go:184] no items to output this cycle
I0319 13:06:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 13:06:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:33.409788 543705 memory.go:184] no items to output this cycle
I0319 13:06:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 13:06:37.715422 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:06:37.715429 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:06:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:43.410657 543705 memory.go:191] Add success.
I0319 13:06:43.409804 543705 cpu.go:282] Add success.
I0319 13:06:43.420362 543705 net.go:648] Add success.
I0319 13:06:43.422873 543705 net.go:770] primary dev: ETH0
I0319 13:06:43.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:06:43.422898 543705 net.go:698] Add success.
I0319 13:06:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:06:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:06:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:06:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:06:53.409766 543705 memory.go:184] no items to output this cycle
I0319 13:06:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 13:07:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:03.409807 543705 memory.go:184] no items to output this cycle
I0319 13:07:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 13:07:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:13.409783 543705 memory.go:191] Add success.
I0319 13:07:13.409803 543705 cpu.go:282] Add success.
W0319 13:07:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:07:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:07:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:07:13.420047 543705 net.go:648] Add success.
I0319 13:07:13.422888 543705 net.go:770] primary dev: ETH0
I0319 13:07:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:07:13.422917 543705 net.go:698] Add success.
I0319 13:07:13.452780 543705 event_worker.go:152] Polling the log file for events...
W0319 13:07:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:07:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 13:07:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:07:14.455891 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:07:14.455899 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:07:14.455905 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:07:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 13:07:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:07:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:07:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:07:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:07:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:07:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:07:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:07:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:07:19.285677 543705 disk_info.go:125] begin check local disk info of client
I0319 13:07:19.288130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:07:19.288136 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b0080 0xc0003b00c0]
E0319 13:07:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:23.409796 543705 memory.go:184] no items to output this cycle
I0319 13:07:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:07:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:33.409866 543705 memory.go:184] no items to output this cycle
I0319 13:07:33.409923 543705 cpu.go:275] no items to output this cycle
E0319 13:07:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:43.409783 543705 memory.go:191] Add success.
I0319 13:07:43.409818 543705 cpu.go:282] Add success.
I0319 13:07:43.420035 543705 net.go:648] Add success.
I0319 13:07:43.422878 543705 net.go:770] primary dev: ETH0
I0319 13:07:43.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:07:43.422905 543705 net.go:698] Add success.
I0319 13:07:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:07:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:07:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:07:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:07:53.409767 543705 memory.go:184] no items to output this cycle
I0319 13:07:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 13:08:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:03.409779 543705 memory.go:184] no items to output this cycle
I0319 13:08:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 13:08:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:13.409828 543705 memory.go:191] Add success.
I0319 13:08:13.409843 543705 cpu.go:282] Add success.
W0319 13:08:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:08:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:08:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:08:13.420290 543705 net.go:648] Add success.
I0319 13:08:13.423120 543705 net.go:770] primary dev: ETH0
I0319 13:08:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:08:13.423146 543705 net.go:698] Add success.
I0319 13:08:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:08:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:08:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 13:08:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:08:14.456607 543705 disk_worker.go:494] system disk:vda1
I0319 13:08:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:08:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:08:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:08:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:08:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:08:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:08:19.289676 543705 disk_info.go:125] begin check local disk info of client
I0319 13:08:19.292104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:08:19.292111 543705 disk_info.go:196] parse disk info done, disk is : [0xc000290640 0xc000290680]
E0319 13:08:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:23.409802 543705 memory.go:184] no items to output this cycle
I0319 13:08:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:08:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:33.409780 543705 memory.go:184] no items to output this cycle
I0319 13:08:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:08:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:43.409780 543705 memory.go:191] Add success.
I0319 13:08:43.409804 543705 cpu.go:282] Add success.
I0319 13:08:43.419989 543705 net.go:648] Add success.
I0319 13:08:43.422969 543705 net.go:770] primary dev: ETH0
I0319 13:08:43.422985 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:08:43.422999 543705 net.go:698] Add success.
I0319 13:08:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:08:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:08:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:08:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:08:53.409779 543705 memory.go:184] no items to output this cycle
I0319 13:08:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 13:09:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:03.409784 543705 memory.go:184] no items to output this cycle
I0319 13:09:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:09:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:13.409817 543705 memory.go:191] Add success.
I0319 13:09:13.409822 543705 cpu.go:282] Add success.
W0319 13:09:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:09:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:09:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:09:13.420401 543705 net.go:648] Add success.
I0319 13:09:13.423156 543705 net.go:770] primary dev: ETH0
I0319 13:09:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:09:13.423181 543705 net.go:698] Add success.
I0319 13:09:13.624332 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"19e3b232-2629-45fb-acc3-5dfeb7c45501","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:09:13.624368 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:09:14.453987 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:09:14.454204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:09:14.454217 543705 disk_worker.go:708] disk space is not compliant
W0319 13:09:14.454220 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:09:14.455606 543705 disk_worker.go:494] system disk:vda1
I0319 13:09:14.455663 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:09:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:09:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:09:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:09:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:09:16.472527 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:09:19.293676 543705 disk_info.go:125] begin check local disk info of client
I0319 13:09:19.296138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:09:19.296145 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029a340 0xc00029a380]
E0319 13:09:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:23.409777 543705 memory.go:184] no items to output this cycle
I0319 13:09:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 13:09:33.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:33.409882 543705 memory.go:184] no items to output this cycle
I0319 13:09:33.409964 543705 cpu.go:275] no items to output this cycle
I0319 13:09:37.716272 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:09:37.716279 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:09:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:43.410680 543705 memory.go:191] Add success.
I0319 13:09:43.409805 543705 cpu.go:282] Add success.
I0319 13:09:43.420403 543705 net.go:648] Add success.
I0319 13:09:43.423115 543705 net.go:770] primary dev: ETH0
I0319 13:09:43.423129 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:09:43.423143 543705 net.go:698] Add success.
I0319 13:09:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:09:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:09:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:09:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:09:53.409770 543705 memory.go:184] no items to output this cycle
I0319 13:09:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 13:10:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:03.409783 543705 memory.go:184] no items to output this cycle
I0319 13:10:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:10:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:13.409791 543705 memory.go:191] Add success.
I0319 13:10:13.409795 543705 cpu.go:282] Add success.
W0319 13:10:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:10:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:10:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:10:13.420051 543705 net.go:648] Add success.
I0319 13:10:13.422877 543705 net.go:770] primary dev: ETH0
I0319 13:10:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:10:13.422904 543705 net.go:698] Add success.
I0319 13:10:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:10:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:10:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 13:10:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:10:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 13:10:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:10:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:10:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:10:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:10:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:10:16.472515 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:10:19.297674 543705 disk_info.go:125] begin check local disk info of client
I0319 13:10:19.300121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:10:19.300127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002af540 0xc0002af580]
E0319 13:10:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:23.409803 543705 memory.go:184] no items to output this cycle
I0319 13:10:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:10:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:33.409786 543705 cpu.go:275] no items to output this cycle
I0319 13:10:33.409793 543705 memory.go:184] no items to output this cycle
E0319 13:10:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:43.409801 543705 memory.go:191] Add success.
I0319 13:10:43.409803 543705 cpu.go:282] Add success.
I0319 13:10:43.420023 543705 net.go:648] Add success.
I0319 13:10:43.422701 543705 net.go:770] primary dev: ETH0
I0319 13:10:43.422715 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:10:43.422727 543705 net.go:698] Add success.
I0319 13:10:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:10:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:10:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:10:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:10:53.409791 543705 memory.go:184] no items to output this cycle
I0319 13:10:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 13:11:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:03.409775 543705 memory.go:184] no items to output this cycle
I0319 13:11:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 13:11:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:13.409809 543705 memory.go:191] Add success.
I0319 13:11:13.409818 543705 cpu.go:282] Add success.
W0319 13:11:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:11:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:11:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:11:13.420135 543705 net.go:648] Add success.
I0319 13:11:13.422982 543705 net.go:770] primary dev: ETH0
I0319 13:11:13.423001 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:11:13.423016 543705 net.go:698] Add success.
I0319 13:11:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:11:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:11:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 13:11:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:11:14.456574 543705 disk_worker.go:494] system disk:vda1
I0319 13:11:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:11:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:11:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:11:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:11:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:11:16.472495 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:11:19.301674 543705 disk_info.go:125] begin check local disk info of client
I0319 13:11:19.304150 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:11:19.304156 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6e80 0xc0003b6ec0]
E0319 13:11:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:23.409775 543705 memory.go:184] no items to output this cycle
I0319 13:11:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 13:11:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:33.409770 543705 memory.go:184] no items to output this cycle
I0319 13:11:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:11:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:43.409904 543705 cpu.go:282] Add success.
I0319 13:11:43.409921 543705 memory.go:191] Add success.
I0319 13:11:43.419745 543705 net.go:648] Add success.
I0319 13:11:43.422881 543705 net.go:770] primary dev: ETH0
I0319 13:11:43.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:11:43.422908 543705 net.go:698] Add success.
I0319 13:11:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:11:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:11:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:11:53.410733 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:11:53.410753 543705 memory.go:184] no items to output this cycle
I0319 13:11:53.410764 543705 cpu.go:275] no items to output this cycle
E0319 13:12:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:03.409807 543705 memory.go:184] no items to output this cycle
I0319 13:12:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 13:12:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:13.409791 543705 cpu.go:282] Add success.
I0319 13:12:13.409801 543705 memory.go:191] Add success.
W0319 13:12:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:12:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:12:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:12:13.420299 543705 net.go:648] Add success.
I0319 13:12:13.422992 543705 net.go:770] primary dev: ETH0
I0319 13:12:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:12:13.423018 543705 net.go:698] Add success.
I0319 13:12:13.469705 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0c2bd783-78ee-4f30-9602-0684e7e5af0b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:12:13.469741 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 13:12:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:12:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 13:12:14.455229 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:12:14.455991 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:12:14.456000 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:12:14.456006 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:12:14.456846 543705 disk_worker.go:494] system disk:vda1
I0319 13:12:14.456880 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:12:15.456859 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:12:15.456867 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:12:16.457921 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:12:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:12:16.457973 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:12:16.457992 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:12:16.472357 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:12:19.305671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:12:19.308006 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:12:19.308012 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024d240 0xc00024d280]
E0319 13:12:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:23.409797 543705 memory.go:184] no items to output this cycle
I0319 13:12:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:12:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:33.409795 543705 cpu.go:275] no items to output this cycle
I0319 13:12:33.409796 543705 memory.go:184] no items to output this cycle
I0319 13:12:37.717284 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:12:37.717291 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:12:43.409938 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:43.410809 543705 memory.go:191] Add success.
I0319 13:12:43.409993 543705 cpu.go:282] Add success.
I0319 13:12:43.419829 543705 net.go:648] Add success.
I0319 13:12:43.422638 543705 net.go:770] primary dev: ETH0
I0319 13:12:43.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:12:43.422662 543705 net.go:698] Add success.
I0319 13:12:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:12:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:12:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:12:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:12:53.409772 543705 memory.go:184] no items to output this cycle
I0319 13:12:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 13:13:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:03.409781 543705 memory.go:184] no items to output this cycle
I0319 13:13:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 13:13:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:13.409809 543705 memory.go:191] Add success.
I0319 13:13:13.409808 543705 cpu.go:282] Add success.
W0319 13:13:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:13:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:13:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:13:13.420077 543705 net.go:648] Add success.
I0319 13:13:13.423286 543705 net.go:770] primary dev: ETH0
I0319 13:13:13.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:13:13.423311 543705 net.go:698] Add success.
I0319 13:13:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:13:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:13:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 13:13:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:13:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 13:13:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:13:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:13:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:13:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:13:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:13:19.309673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:13:19.312053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:13:19.312062 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7700 0xc0003b7740]
E0319 13:13:23.410373 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:23.410390 543705 memory.go:184] no items to output this cycle
I0319 13:13:23.410400 543705 cpu.go:275] no items to output this cycle
E0319 13:13:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:33.409766 543705 memory.go:184] no items to output this cycle
I0319 13:13:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:13:43.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:43.409930 543705 memory.go:191] Add success.
I0319 13:13:43.410119 543705 cpu.go:282] Add success.
I0319 13:13:43.419713 543705 net.go:648] Add success.
I0319 13:13:43.422378 543705 net.go:770] primary dev: ETH0
I0319 13:13:43.422391 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:13:43.422403 543705 net.go:698] Add success.
I0319 13:13:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:13:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:13:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:13:53.410245 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:13:53.410262 543705 memory.go:184] no items to output this cycle
I0319 13:13:53.410282 543705 cpu.go:275] no items to output this cycle
E0319 13:14:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:03.409793 543705 memory.go:184] no items to output this cycle
I0319 13:14:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 13:14:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:13.409799 543705 memory.go:191] Add success.
I0319 13:14:13.409799 543705 cpu.go:282] Add success.
W0319 13:14:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:14:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:14:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:14:13.420323 543705 net.go:648] Add success.
I0319 13:14:13.422949 543705 net.go:770] primary dev: ETH0
I0319 13:14:13.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:14:13.422981 543705 net.go:698] Add success.
I0319 13:14:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:14:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:14:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 13:14:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:14:14.456505 543705 disk_worker.go:494] system disk:vda1
I0319 13:14:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:14:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:14:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:14:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:14:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:14:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:14:19.313671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:14:19.316105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:14:19.316110 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7100 0xc0003b7140]
E0319 13:14:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:23.409799 543705 memory.go:184] no items to output this cycle
I0319 13:14:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:14:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:33.409774 543705 memory.go:184] no items to output this cycle
I0319 13:14:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 13:14:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:43.409830 543705 memory.go:191] Add success.
I0319 13:14:43.409839 543705 cpu.go:282] Add success.
I0319 13:14:43.420050 543705 net.go:648] Add success.
I0319 13:14:43.422742 543705 net.go:770] primary dev: ETH0
I0319 13:14:43.422755 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:14:43.422767 543705 net.go:698] Add success.
I0319 13:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:14:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:14:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:14:53.410410 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:14:53.410426 543705 memory.go:184] no items to output this cycle
I0319 13:14:53.410449 543705 cpu.go:275] no items to output this cycle
E0319 13:15:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:03.409783 543705 memory.go:184] no items to output this cycle
I0319 13:15:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 13:15:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:13.409819 543705 memory.go:191] Add success.
I0319 13:15:13.409830 543705 cpu.go:282] Add success.
W0319 13:15:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:15:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:15:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:15:13.420259 543705 net.go:648] Add success.
I0319 13:15:13.423106 543705 net.go:770] primary dev: ETH0
I0319 13:15:13.423118 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:15:13.423130 543705 net.go:698] Add success.
I0319 13:15:13.468928 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3b5130d5-9708-45bc-813d-9944e2d79716","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:15:13.468962 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:15:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:15:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:15:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 13:15:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:15:14.456678 543705 disk_worker.go:494] system disk:vda1
I0319 13:15:14.456707 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:15:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:15:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:15:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:15:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:15:19.317676 543705 disk_info.go:125] begin check local disk info of client
I0319 13:15:19.320045 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:15:19.320051 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abe80 0xc0001abec0]
E0319 13:15:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:23.409796 543705 memory.go:184] no items to output this cycle
I0319 13:15:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:15:33.409890 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:33.409908 543705 memory.go:184] no items to output this cycle
I0319 13:15:33.410014 543705 cpu.go:275] no items to output this cycle
I0319 13:15:37.717731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:15:37.717738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:15:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:43.410686 543705 memory.go:191] Add success.
I0319 13:15:43.409840 543705 cpu.go:282] Add success.
I0319 13:15:43.420371 543705 net.go:648] Add success.
I0319 13:15:43.423213 543705 net.go:770] primary dev: ETH0
I0319 13:15:43.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:15:43.423239 543705 net.go:698] Add success.
I0319 13:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:15:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:15:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:15:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:15:53.409770 543705 memory.go:184] no items to output this cycle
I0319 13:15:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 13:16:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:03.409813 543705 memory.go:184] no items to output this cycle
I0319 13:16:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 13:16:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:13.409783 543705 memory.go:191] Add success.
W0319 13:16:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:16:13.409811 543705 cpu.go:282] Add success.
W0319 13:16:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:16:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:16:13.420399 543705 net.go:648] Add success.
I0319 13:16:13.423125 543705 net.go:770] primary dev: ETH0
I0319 13:16:13.423138 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:16:13.423150 543705 net.go:698] Add success.
I0319 13:16:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:16:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:16:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 13:16:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:16:14.456523 543705 disk_worker.go:494] system disk:vda1
I0319 13:16:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:16:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:16:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:16:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:16:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:16:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:16:19.321673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:16:19.324040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:16:19.324046 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bf3c0 0xc0003bf400]
E0319 13:16:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:23.409805 543705 memory.go:184] no items to output this cycle
I0319 13:16:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 13:16:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:33.409795 543705 memory.go:184] no items to output this cycle
I0319 13:16:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:16:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:43.409788 543705 memory.go:191] Add success.
I0319 13:16:43.409814 543705 cpu.go:282] Add success.
I0319 13:16:43.419990 543705 net.go:648] Add success.
I0319 13:16:43.422815 543705 net.go:770] primary dev: ETH0
I0319 13:16:43.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:16:43.422845 543705 net.go:698] Add success.
I0319 13:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:16:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:16:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:16:53.410382 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:16:53.410401 543705 memory.go:184] no items to output this cycle
I0319 13:16:53.410416 543705 cpu.go:275] no items to output this cycle
E0319 13:17:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:03.409788 543705 memory.go:184] no items to output this cycle
I0319 13:17:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 13:17:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:13.409820 543705 memory.go:191] Add success.
I0319 13:17:13.409832 543705 cpu.go:282] Add success.
W0319 13:17:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:17:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:17:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:17:13.420185 543705 net.go:648] Add success.
I0319 13:17:13.423012 543705 net.go:770] primary dev: ETH0
I0319 13:17:13.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:17:13.423042 543705 net.go:698] Add success.
I0319 13:17:13.453610 543705 event_worker.go:152] Polling the log file for events...
W0319 13:17:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:17:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 13:17:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:17:14.456778 543705 disk_worker.go:494] system disk:vda1
I0319 13:17:14.456821 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:17:14.457139 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:17:14.457146 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:17:14.457151 543705 custom_config.go:64] query custom config with name: gpu
E0319 13:17:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:17:15.456866 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:17:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:17:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:17:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:17:16.457990 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:17:16.472346 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:17:19.327012 543705 disk_info.go:125] begin check local disk info of client
I0319 13:17:19.329372 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:17:19.329378 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be980 0xc0003be9c0]
E0319 13:17:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:23.409758 543705 memory.go:184] no items to output this cycle
I0319 13:17:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 13:17:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:33.409780 543705 memory.go:184] no items to output this cycle
I0319 13:17:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:17:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:43.409827 543705 memory.go:191] Add success.
I0319 13:17:43.409828 543705 cpu.go:282] Add success.
I0319 13:17:43.420008 543705 net.go:648] Add success.
I0319 13:17:43.422895 543705 net.go:770] primary dev: ETH0
I0319 13:17:43.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:17:43.422921 543705 net.go:698] Add success.
I0319 13:17:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:17:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:17:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:17:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:17:53.409774 543705 memory.go:184] no items to output this cycle
I0319 13:17:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 13:18:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:03.409812 543705 memory.go:184] no items to output this cycle
I0319 13:18:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 13:18:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:13.409794 543705 memory.go:191] Add success.
I0319 13:18:13.409815 543705 cpu.go:282] Add success.
W0319 13:18:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:18:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:18:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:18:13.420161 543705 net.go:648] Add success.
I0319 13:18:13.423025 543705 net.go:770] primary dev: ETH0
I0319 13:18:13.423040 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:18:13.423055 543705 net.go:698] Add success.
I0319 13:18:13.533607 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f560c095-4868-4073-b481-b1009afb0d87","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:18:13.533660 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:18:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:18:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:18:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 13:18:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:18:14.456526 543705 disk_worker.go:494] system disk:vda1
I0319 13:18:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:18:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:18:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:18:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:18:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:18:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:18:19.329675 543705 disk_info.go:125] begin check local disk info of client
I0319 13:18:19.332048 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:18:19.332056 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474000 0xc000474040]
E0319 13:18:23.409857 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:23.409874 543705 memory.go:184] no items to output this cycle
I0319 13:18:23.409949 543705 cpu.go:275] no items to output this cycle
E0319 13:18:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:33.409773 543705 memory.go:184] no items to output this cycle
I0319 13:18:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 13:18:37.719284 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:18:37.719291 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:18:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:43.410761 543705 memory.go:191] Add success.
I0319 13:18:43.409804 543705 cpu.go:282] Add success.
I0319 13:18:43.420523 543705 net.go:648] Add success.
I0319 13:18:43.423171 543705 net.go:770] primary dev: ETH0
I0319 13:18:43.423186 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:18:43.423200 543705 net.go:698] Add success.
I0319 13:18:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:18:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:18:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:18:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:18:53.409768 543705 memory.go:184] no items to output this cycle
I0319 13:18:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 13:19:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:03.409780 543705 memory.go:184] no items to output this cycle
I0319 13:19:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 13:19:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:13.409802 543705 memory.go:191] Add success.
I0319 13:19:13.409804 543705 cpu.go:282] Add success.
W0319 13:19:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:19:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:19:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:19:13.420124 543705 net.go:648] Add success.
I0319 13:19:13.422874 543705 net.go:770] primary dev: ETH0
I0319 13:19:13.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:19:13.422904 543705 net.go:698] Add success.
I0319 13:19:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:19:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:19:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 13:19:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:19:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 13:19:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:19:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:19:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:19:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:19:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:19:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:19:19.333669 543705 disk_info.go:125] begin check local disk info of client
I0319 13:19:19.336082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:19:19.336088 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304ac0 0xc000304b00]
E0319 13:19:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:23.409776 543705 cpu.go:275] no items to output this cycle
I0319 13:19:23.409778 543705 memory.go:184] no items to output this cycle
E0319 13:19:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:33.409768 543705 memory.go:184] no items to output this cycle
I0319 13:19:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:19:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:43.409821 543705 memory.go:191] Add success.
I0319 13:19:43.409835 543705 cpu.go:282] Add success.
I0319 13:19:43.420232 543705 net.go:648] Add success.
I0319 13:19:43.423134 543705 net.go:770] primary dev: ETH0
I0319 13:19:43.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:19:43.423159 543705 net.go:698] Add success.
I0319 13:19:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:19:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:19:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:19:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:19:53.409768 543705 memory.go:184] no items to output this cycle
I0319 13:19:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 13:20:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:03.409817 543705 memory.go:184] no items to output this cycle
I0319 13:20:03.409836 543705 cpu.go:275] no items to output this cycle
E0319 13:20:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:13.409798 543705 memory.go:191] Add success.
I0319 13:20:13.409804 543705 cpu.go:282] Add success.
W0319 13:20:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:20:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:20:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:20:13.420096 543705 net.go:648] Add success.
I0319 13:20:13.422784 543705 net.go:770] primary dev: ETH0
I0319 13:20:13.422798 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:20:13.422813 543705 net.go:698] Add success.
I0319 13:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:20:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:20:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0319 13:20:14.455149 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:20:14.456478 543705 disk_worker.go:494] system disk:vda1
I0319 13:20:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:20:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:20:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:20:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:20:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:20:19.337671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:20:19.340117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:20:19.340124 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028a400 0xc00028a440]
E0319 13:20:23.410510 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:23.410547 543705 memory.go:184] no items to output this cycle
I0319 13:20:23.410588 543705 cpu.go:275] no items to output this cycle
E0319 13:20:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:33.409778 543705 memory.go:184] no items to output this cycle
I0319 13:20:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:20:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:43.409814 543705 memory.go:191] Add success.
I0319 13:20:43.409822 543705 cpu.go:282] Add success.
I0319 13:20:43.419955 543705 net.go:648] Add success.
I0319 13:20:43.422863 543705 net.go:770] primary dev: ETH0
I0319 13:20:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:20:43.422893 543705 net.go:698] Add success.
I0319 13:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:20:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:20:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:20:53.410357 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:20:53.410370 543705 cpu.go:275] no items to output this cycle
I0319 13:20:53.410373 543705 memory.go:184] no items to output this cycle
E0319 13:21:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:03.409782 543705 memory.go:184] no items to output this cycle
I0319 13:21:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 13:21:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:13.409825 543705 memory.go:191] Add success.
I0319 13:21:13.409834 543705 cpu.go:282] Add success.
W0319 13:21:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:21:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:21:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:21:13.420128 543705 net.go:648] Add success.
I0319 13:21:13.422964 543705 net.go:770] primary dev: ETH0
I0319 13:21:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:21:13.422988 543705 net.go:698] Add success.
I0319 13:21:13.464157 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e19901d8-0b58-4e1b-b34f-093570fe7f7a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:21:13.464189 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:21:14.454984 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:21:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:21:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 13:21:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:21:14.456504 543705 disk_worker.go:494] system disk:vda1
I0319 13:21:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:21:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:21:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:21:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:21:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:21:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:21:19.341675 543705 disk_info.go:125] begin check local disk info of client
I0319 13:21:19.344039 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:21:19.344045 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a840 0xc00036a880]
E0319 13:21:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:23.409788 543705 memory.go:184] no items to output this cycle
I0319 13:21:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 13:21:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:33.409783 543705 memory.go:184] no items to output this cycle
I0319 13:21:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 13:21:37.720302 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:21:37.720309 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:21:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:43.410768 543705 memory.go:191] Add success.
I0319 13:21:43.409802 543705 cpu.go:282] Add success.
I0319 13:21:43.420476 543705 net.go:648] Add success.
I0319 13:21:43.423038 543705 net.go:770] primary dev: ETH0
I0319 13:21:43.423053 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:21:43.423067 543705 net.go:698] Add success.
I0319 13:21:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:21:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:21:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:21:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:21:53.409764 543705 memory.go:184] no items to output this cycle
I0319 13:21:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:22:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:03.409794 543705 memory.go:184] no items to output this cycle
I0319 13:22:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:22:13.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:13.409911 543705 memory.go:191] Add success.
W0319 13:22:13.409940 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:22:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:22:13.409960 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:22:13.410188 543705 cpu.go:282] Add success.
I0319 13:22:13.419717 543705 net.go:648] Add success.
I0319 13:22:13.422355 543705 net.go:770] primary dev: ETH0
I0319 13:22:13.422368 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:22:13.422379 543705 net.go:698] Add success.
W0319 13:22:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:22:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 13:22:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:22:14.456781 543705 disk_worker.go:494] system disk:vda1
I0319 13:22:14.456819 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:22:14.457100 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:22:14.457108 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:22:14.457112 543705 custom_config.go:64] query custom config with name: gpu
E0319 13:22:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:22:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:22:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:22:16.457930 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:22:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:22:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:22:16.472323 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:22:19.345682 543705 disk_info.go:125] begin check local disk info of client
I0319 13:22:19.348005 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:22:19.348011 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6380 0xc0003b63c0]
E0319 13:22:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:23.409794 543705 memory.go:184] no items to output this cycle
I0319 13:22:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 13:22:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:33.409814 543705 memory.go:184] no items to output this cycle
I0319 13:22:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 13:22:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:43.409789 543705 memory.go:191] Add success.
I0319 13:22:43.409822 543705 cpu.go:282] Add success.
I0319 13:22:43.419994 543705 net.go:648] Add success.
I0319 13:22:43.422645 543705 net.go:770] primary dev: ETH0
I0319 13:22:43.422658 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:22:43.422671 543705 net.go:698] Add success.
I0319 13:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:22:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:22:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:22:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:22:53.409776 543705 memory.go:184] no items to output this cycle
I0319 13:22:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:23:03.409923 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:03.409946 543705 memory.go:184] no items to output this cycle
I0319 13:23:03.409953 543705 cpu.go:275] no items to output this cycle
E0319 13:23:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:13.409801 543705 memory.go:191] Add success.
W0319 13:23:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:23:13.409837 543705 cpu.go:282] Add success.
W0319 13:23:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:23:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:23:13.420291 543705 net.go:648] Add success.
I0319 13:23:13.423019 543705 net.go:770] primary dev: ETH0
I0319 13:23:13.423033 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:23:13.423048 543705 net.go:698] Add success.
I0319 13:23:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:23:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:23:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0319 13:23:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:23:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 13:23:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:23:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:23:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:23:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:23:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:23:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:23:19.349674 543705 disk_info.go:125] begin check local disk info of client
I0319 13:23:19.352052 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:23:19.352058 543705 disk_info.go:196] parse disk info done, disk is : [0xc000321500 0xc000321540]
E0319 13:23:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:23.409804 543705 memory.go:184] no items to output this cycle
I0319 13:23:23.409825 543705 cpu.go:275] no items to output this cycle
E0319 13:23:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:33.409816 543705 memory.go:184] no items to output this cycle
I0319 13:23:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 13:23:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:43.409797 543705 memory.go:191] Add success.
I0319 13:23:43.409842 543705 cpu.go:282] Add success.
I0319 13:23:43.420101 543705 net.go:648] Add success.
I0319 13:23:43.422899 543705 net.go:770] primary dev: ETH0
I0319 13:23:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:23:43.422924 543705 net.go:698] Add success.
I0319 13:23:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:23:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:23:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:23:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:23:53.409767 543705 memory.go:184] no items to output this cycle
I0319 13:23:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:24:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:03.409780 543705 memory.go:184] no items to output this cycle
I0319 13:24:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 13:24:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:13.409822 543705 memory.go:191] Add success.
I0319 13:24:13.409823 543705 cpu.go:282] Add success.
W0319 13:24:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:24:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:24:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:24:13.420130 543705 net.go:648] Add success.
I0319 13:24:13.423124 543705 net.go:770] primary dev: ETH0
I0319 13:24:13.423139 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:24:13.423158 543705 net.go:698] Add success.
I0319 13:24:13.806745 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c1d8d9f1-9e9e-4027-b0ca-b9825152647f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:24:13.806783 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:24:14.453985 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:24:14.454218 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:24:14.454228 543705 disk_worker.go:708] disk space is not compliant
W0319 13:24:14.454231 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:24:14.455724 543705 disk_worker.go:494] system disk:vda1
I0319 13:24:14.455752 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:24:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:24:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:24:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:24:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:24:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:24:19.353672 543705 disk_info.go:125] begin check local disk info of client
I0319 13:24:19.356114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:24:19.356121 543705 disk_info.go:196] parse disk info done, disk is : [0xc00055dc80 0xc00055dcc0]
E0319 13:24:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:23.409766 543705 memory.go:184] no items to output this cycle
I0319 13:24:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 13:24:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:33.409789 543705 memory.go:184] no items to output this cycle
I0319 13:24:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 13:24:37.721297 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:24:37.721304 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:24:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:43.410759 543705 memory.go:191] Add success.
I0319 13:24:43.409800 543705 cpu.go:282] Add success.
I0319 13:24:43.420469 543705 net.go:648] Add success.
I0319 13:24:43.423627 543705 net.go:770] primary dev: ETH0
I0319 13:24:43.423640 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:24:43.423653 543705 net.go:698] Add success.
I0319 13:24:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:24:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:24:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:24:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:24:53.409765 543705 memory.go:184] no items to output this cycle
I0319 13:24:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:25:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:03.409812 543705 memory.go:184] no items to output this cycle
I0319 13:25:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 13:25:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:13.409816 543705 memory.go:191] Add success.
I0319 13:25:13.409820 543705 cpu.go:282] Add success.
W0319 13:25:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:25:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:25:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:25:13.420152 543705 net.go:648] Add success.
I0319 13:25:13.423225 543705 net.go:770] primary dev: ETH0
I0319 13:25:13.423240 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:25:13.423252 543705 net.go:698] Add success.
I0319 13:25:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:25:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:25:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 13:25:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:25:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 13:25:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:25:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:25:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:25:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:25:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:25:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:25:19.357672 543705 disk_info.go:125] begin check local disk info of client
I0319 13:25:19.360079 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:25:19.360086 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c280 0xc00034c2c0]
E0319 13:25:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:23.409760 543705 memory.go:184] no items to output this cycle
I0319 13:25:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 13:25:33.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:33.409867 543705 memory.go:184] no items to output this cycle
I0319 13:25:33.409972 543705 cpu.go:275] no items to output this cycle
E0319 13:25:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:43.409807 543705 memory.go:191] Add success.
I0319 13:25:43.409809 543705 cpu.go:282] Add success.
I0319 13:25:43.419807 543705 net.go:770] primary dev: ETH0
I0319 13:25:43.419822 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:25:43.419836 543705 net.go:698] Add success.
I0319 13:25:43.420218 543705 net.go:648] Add success.
I0319 13:25:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:25:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:25:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:25:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:25:53.409766 543705 memory.go:184] no items to output this cycle
I0319 13:25:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:26:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:03.409805 543705 cpu.go:275] no items to output this cycle
I0319 13:26:03.409817 543705 memory.go:184] no items to output this cycle
E0319 13:26:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:13.409817 543705 memory.go:191] Add success.
I0319 13:26:13.409829 543705 cpu.go:282] Add success.
W0319 13:26:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:26:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:26:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:26:13.420131 543705 net.go:648] Add success.
I0319 13:26:13.422961 543705 net.go:770] primary dev: ETH0
I0319 13:26:13.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:26:13.422990 543705 net.go:698] Add success.
I0319 13:26:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:26:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:26:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 13:26:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:26:14.456690 543705 disk_worker.go:494] system disk:vda1
I0319 13:26:14.456718 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:26:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:26:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:26:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:26:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:26:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:26:19.361674 543705 disk_info.go:125] begin check local disk info of client
I0319 13:26:19.364044 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:26:19.364049 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000df800 0xc0000df840]
E0319 13:26:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:23.409793 543705 memory.go:184] no items to output this cycle
I0319 13:26:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:26:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:33.409784 543705 memory.go:184] no items to output this cycle
I0319 13:26:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 13:26:43.409852 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:43.409889 543705 memory.go:191] Add success.
I0319 13:26:43.409952 543705 cpu.go:282] Add success.
I0319 13:26:43.419730 543705 net.go:648] Add success.
I0319 13:26:43.422556 543705 net.go:770] primary dev: ETH0
I0319 13:26:43.422569 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:26:43.422581 543705 net.go:698] Add success.
I0319 13:26:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:26:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:26:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:26:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:26:53.409800 543705 memory.go:184] no items to output this cycle
I0319 13:26:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:27:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:03.409782 543705 memory.go:184] no items to output this cycle
I0319 13:27:03.409821 543705 cpu.go:275] no items to output this cycle
W0319 13:27:13.409701 543705 conf_downlod.go:84] cant't download conf: Get "": unsupported protocol scheme ""
W0319 13:27:13.409739 543705 conf_downlod.go:89] use old conf
E0319 13:27:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:13.409795 543705 cpu.go:282] Add success.
I0319 13:27:13.409801 543705 memory.go:191] Add success.
W0319 13:27:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:27:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:27:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:27:13.420156 543705 net.go:648] Add success.
I0319 13:27:13.422767 543705 net.go:770] primary dev: ETH0
I0319 13:27:13.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:27:13.422793 543705 net.go:698] Add success.
I0319 13:27:13.429092 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 13:27:13.453266 543705 event_worker.go:152] Polling the log file for events...
I0319 13:27:14.247417 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a7882db4-f173-4857-b9f6-b6edfb78ce02","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:27:14.247454 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 13:27:14.454240 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:27:14.454251 543705 disk_worker.go:708] disk space is not compliant
W0319 13:27:14.454253 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:27:14.455826 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:27:14.455846 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:27:14.455853 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:27:14.456043 543705 disk_worker.go:494] system disk:vda1
I0319 13:27:14.456083 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:27:15.456454 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:27:15.456465 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:27:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:27:16.457972 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:27:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:27:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:27:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:27:19.365671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:27:19.368088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:27:19.368094 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328b00 0xc000328b40]
E0319 13:27:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:23.409768 543705 memory.go:184] no items to output this cycle
I0319 13:27:23.409775 543705 cpu.go:275] no items to output this cycle
E0319 13:27:33.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:33.409891 543705 memory.go:184] no items to output this cycle
I0319 13:27:33.409976 543705 cpu.go:275] no items to output this cycle
I0319 13:27:37.721728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:27:37.721734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:27:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:43.410658 543705 memory.go:191] Add success.
I0319 13:27:43.409801 543705 cpu.go:282] Add success.
I0319 13:27:43.420339 543705 net.go:648] Add success.
I0319 13:27:43.422939 543705 net.go:770] primary dev: ETH0
I0319 13:27:43.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:27:43.422966 543705 net.go:698] Add success.
I0319 13:27:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:27:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:27:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:27:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:27:53.409774 543705 memory.go:184] no items to output this cycle
I0319 13:27:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 13:28:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:03.409795 543705 memory.go:184] no items to output this cycle
I0319 13:28:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:28:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:13.409820 543705 memory.go:191] Add success.
I0319 13:28:13.409831 543705 cpu.go:282] Add success.
W0319 13:28:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:28:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:28:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:28:13.420122 543705 net.go:648] Add success.
I0319 13:28:13.423019 543705 net.go:770] primary dev: ETH0
I0319 13:28:13.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:28:13.423044 543705 net.go:698] Add success.
I0319 13:28:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:28:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:28:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 13:28:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:28:14.456504 543705 disk_worker.go:494] system disk:vda1
I0319 13:28:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:28:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:28:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:28:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:28:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:28:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:28:19.369673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:28:19.372104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:28:19.372110 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364a40 0xc000364a80]
E0319 13:28:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:23.409762 543705 memory.go:184] no items to output this cycle
I0319 13:28:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 13:28:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:33.409857 543705 memory.go:184] no items to output this cycle
I0319 13:28:33.409905 543705 cpu.go:275] no items to output this cycle
E0319 13:28:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:43.409804 543705 memory.go:191] Add success.
I0319 13:28:43.409807 543705 cpu.go:282] Add success.
I0319 13:28:43.419855 543705 net.go:648] Add success.
I0319 13:28:43.422673 543705 net.go:770] primary dev: ETH0
I0319 13:28:43.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:28:43.422703 543705 net.go:698] Add success.
I0319 13:28:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:28:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:28:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:28:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:28:53.409767 543705 memory.go:184] no items to output this cycle
I0319 13:28:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 13:29:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:03.409784 543705 memory.go:184] no items to output this cycle
I0319 13:29:03.409834 543705 cpu.go:275] no items to output this cycle
E0319 13:29:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:13.409796 543705 memory.go:191] Add success.
I0319 13:29:13.409799 543705 cpu.go:282] Add success.
W0319 13:29:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:29:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:29:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:29:13.420113 543705 net.go:648] Add success.
I0319 13:29:13.423035 543705 net.go:770] primary dev: ETH0
I0319 13:29:13.423048 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:29:13.423061 543705 net.go:698] Add success.
I0319 13:29:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:29:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:29:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 13:29:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:29:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 13:29:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:29:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:29:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:29:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:29:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:29:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:29:19.373671 543705 disk_info.go:125] begin check local disk info of client
I0319 13:29:19.376089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:29:19.376095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314300 0xc000314340]
E0319 13:29:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:23.409798 543705 memory.go:184] no items to output this cycle
I0319 13:29:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:29:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:33.409769 543705 memory.go:184] no items to output this cycle
I0319 13:29:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 13:29:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:43.409828 543705 memory.go:191] Add success.
I0319 13:29:43.409835 543705 cpu.go:282] Add success.
I0319 13:29:43.419979 543705 net.go:648] Add success.
I0319 13:29:43.422799 543705 net.go:770] primary dev: ETH0
I0319 13:29:43.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:29:43.422828 543705 net.go:698] Add success.
I0319 13:29:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:29:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:29:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:29:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:29:53.409763 543705 memory.go:184] no items to output this cycle
I0319 13:29:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 13:30:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:03.409815 543705 memory.go:184] no items to output this cycle
I0319 13:30:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 13:30:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:13.409816 543705 memory.go:191] Add success.
I0319 13:30:13.409824 543705 cpu.go:282] Add success.
W0319 13:30:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:30:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:30:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:30:13.420261 543705 net.go:648] Add success.
I0319 13:30:13.422777 543705 net.go:770] primary dev: ETH0
I0319 13:30:13.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:30:13.422805 543705 net.go:698] Add success.
I0319 13:30:13.468665 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ee7c7ab1-fddf-4bef-b1d0-cbf2da82fccd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:30:13.468706 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:30:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:30:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:30:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 13:30:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:30:14.456638 543705 disk_worker.go:494] system disk:vda1
I0319 13:30:14.456670 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:30:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:30:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:30:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:30:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:30:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:30:19.377673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:30:19.380045 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:30:19.380052 543705 disk_info.go:196] parse disk info done, disk is : [0xc000287480 0xc0002874c0]
E0319 13:30:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:23.409799 543705 memory.go:184] no items to output this cycle
I0319 13:30:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:30:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:33.409776 543705 memory.go:184] no items to output this cycle
I0319 13:30:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 13:30:37.723306 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:30:37.723313 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:30:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:43.410628 543705 memory.go:191] Add success.
I0319 13:30:43.409922 543705 cpu.go:282] Add success.
I0319 13:30:43.419731 543705 net.go:648] Add success.
I0319 13:30:43.422450 543705 net.go:770] primary dev: ETH0
I0319 13:30:43.422464 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:30:43.422477 543705 net.go:698] Add success.
I0319 13:30:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:30:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:30:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:30:53.409798 543705 memory.go:184] no items to output this cycle
I0319 13:30:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 13:31:03.409815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:03.409825 543705 cpu.go:275] no items to output this cycle
I0319 13:31:03.409836 543705 memory.go:184] no items to output this cycle
E0319 13:31:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:13.409794 543705 memory.go:191] Add success.
I0319 13:31:13.409798 543705 cpu.go:282] Add success.
W0319 13:31:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:31:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:31:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:31:13.420371 543705 net.go:648] Add success.
I0319 13:31:13.423144 543705 net.go:770] primary dev: ETH0
I0319 13:31:13.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:31:13.423183 543705 net.go:698] Add success.
I0319 13:31:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:31:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:31:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 13:31:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:31:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 13:31:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:31:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:31:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:31:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:31:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:31:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:31:19.381669 543705 disk_info.go:125] begin check local disk info of client
I0319 13:31:19.384082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:31:19.384088 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aac80 0xc0001aacc0]
E0319 13:31:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:23.409776 543705 memory.go:184] no items to output this cycle
I0319 13:31:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 13:31:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:33.409802 543705 memory.go:184] no items to output this cycle
I0319 13:31:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 13:31:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:43.409795 543705 memory.go:191] Add success.
I0319 13:31:43.409795 543705 cpu.go:282] Add success.
I0319 13:31:43.419827 543705 net.go:648] Add success.
I0319 13:31:43.422768 543705 net.go:770] primary dev: ETH0
I0319 13:31:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:31:43.422793 543705 net.go:698] Add success.
I0319 13:31:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:31:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:31:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:31:53.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:31:53.409877 543705 memory.go:184] no items to output this cycle
I0319 13:31:53.409987 543705 cpu.go:275] no items to output this cycle
E0319 13:32:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:03.409814 543705 memory.go:184] no items to output this cycle
I0319 13:32:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 13:32:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:13.409795 543705 memory.go:191] Add success.
I0319 13:32:13.409796 543705 cpu.go:282] Add success.
W0319 13:32:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:32:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:32:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:32:13.420398 543705 net.go:648] Add success.
I0319 13:32:13.423342 543705 net.go:770] primary dev: ETH0
I0319 13:32:13.423358 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:32:13.423372 543705 net.go:698] Add success.
W0319 13:32:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:32:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 13:32:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:32:14.456767 543705 disk_worker.go:494] system disk:vda1
I0319 13:32:14.456809 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:32:14.457161 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:32:14.457168 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:32:14.457173 543705 custom_config.go:64] query custom config with name: gpu
E0319 13:32:15.456845 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:32:15.456854 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:32:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:32:16.457963 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:32:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:32:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:32:16.472332 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:32:19.385673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:32:19.388018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:32:19.388024 543705 disk_info.go:196] parse disk info done, disk is : [0xc000356b40 0xc000356b80]
E0319 13:32:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:23.409791 543705 memory.go:184] no items to output this cycle
I0319 13:32:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:32:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:33.409786 543705 memory.go:184] no items to output this cycle
I0319 13:32:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 13:32:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:43.409794 543705 memory.go:191] Add success.
I0319 13:32:43.409797 543705 cpu.go:282] Add success.
I0319 13:32:43.420013 543705 net.go:648] Add success.
I0319 13:32:43.422767 543705 net.go:770] primary dev: ETH0
I0319 13:32:43.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:32:43.422793 543705 net.go:698] Add success.
I0319 13:32:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:32:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:32:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:32:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:32:53.409766 543705 memory.go:184] no items to output this cycle
I0319 13:32:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:33:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:03.409814 543705 memory.go:184] no items to output this cycle
I0319 13:33:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 13:33:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:13.409808 543705 memory.go:191] Add success.
I0319 13:33:13.409810 543705 cpu.go:282] Add success.
W0319 13:33:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:33:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:33:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:33:13.420187 543705 net.go:648] Add success.
I0319 13:33:13.422818 543705 net.go:770] primary dev: ETH0
I0319 13:33:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:33:13.422843 543705 net.go:698] Add success.
I0319 13:33:13.470030 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f75e6234-2039-4b5e-973b-258c239348b6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:33:13.470063 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:33:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:33:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:33:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 13:33:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:33:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 13:33:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:33:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:33:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:33:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:33:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:33:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:33:19.392061 543705 disk_info.go:125] begin check local disk info of client
I0319 13:33:19.394467 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:33:19.394473 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bda00 0xc0002bda40]
E0319 13:33:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:23.409760 543705 memory.go:184] no items to output this cycle
I0319 13:33:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 13:33:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:33.409803 543705 memory.go:184] no items to output this cycle
I0319 13:33:33.409819 543705 cpu.go:275] no items to output this cycle
I0319 13:33:37.723451 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:33:37.723458 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:33:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:43.410732 543705 memory.go:191] Add success.
I0319 13:33:43.409800 543705 cpu.go:282] Add success.
I0319 13:33:43.420430 543705 net.go:648] Add success.
I0319 13:33:43.423048 543705 net.go:770] primary dev: ETH0
I0319 13:33:43.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:33:43.423073 543705 net.go:698] Add success.
I0319 13:33:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:33:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:33:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:33:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:33:53.409774 543705 memory.go:184] no items to output this cycle
I0319 13:33:53.409810 543705 cpu.go:275] no items to output this cycle
I0319 13:34:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:34:03.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:03.409823 543705 memory.go:184] no items to output this cycle
E0319 13:34:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:13.409796 543705 memory.go:191] Add success.
W0319 13:34:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:34:13.409825 543705 cpu.go:282] Add success.
W0319 13:34:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:34:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:34:13.420312 543705 net.go:648] Add success.
I0319 13:34:13.423386 543705 net.go:770] primary dev: ETH0
I0319 13:34:13.423399 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:34:13.423410 543705 net.go:698] Add success.
I0319 13:34:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:34:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:34:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0319 13:34:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:34:14.456493 543705 disk_worker.go:494] system disk:vda1
I0319 13:34:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:34:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:34:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:34:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:34:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:34:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:34:19.397672 543705 disk_info.go:125] begin check local disk info of client
I0319 13:34:19.400051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:34:19.400057 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bac0 0xc00007bb00]
E0319 13:34:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:23.409789 543705 memory.go:184] no items to output this cycle
I0319 13:34:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:34:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:33.409776 543705 memory.go:184] no items to output this cycle
I0319 13:34:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:34:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:43.409784 543705 memory.go:191] Add success.
I0319 13:34:43.409794 543705 cpu.go:282] Add success.
I0319 13:34:43.419906 543705 net.go:648] Add success.
I0319 13:34:43.422543 543705 net.go:770] primary dev: ETH0
I0319 13:34:43.422559 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:34:43.422572 543705 net.go:698] Add success.
I0319 13:34:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:34:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:34:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:34:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:34:53.409781 543705 cpu.go:275] no items to output this cycle
I0319 13:34:53.409782 543705 memory.go:184] no items to output this cycle
I0319 13:35:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 13:35:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:03.409812 543705 memory.go:184] no items to output this cycle
E0319 13:35:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:13.409937 543705 cpu.go:282] Add success.
I0319 13:35:13.409962 543705 memory.go:191] Add success.
W0319 13:35:13.410019 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:35:13.410044 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:35:13.410049 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:35:13.419760 543705 net.go:648] Add success.
I0319 13:35:13.422626 543705 net.go:770] primary dev: ETH0
I0319 13:35:13.422641 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:35:13.422654 543705 net.go:698] Add success.
I0319 13:35:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:35:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:35:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 13:35:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:35:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 13:35:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:35:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:35:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:35:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:35:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:35:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:35:19.401673 543705 disk_info.go:125] begin check local disk info of client
I0319 13:35:19.404036 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:35:19.404042 543705 disk_info.go:196] parse disk info done, disk is : [0xc000264600 0xc000264640]
E0319 13:35:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:23.409792 543705 memory.go:184] no items to output this cycle
I0319 13:35:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 13:35:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:33.409782 543705 memory.go:184] no items to output this cycle
I0319 13:35:33.409788 543705 cpu.go:275] no items to output this cycle
E0319 13:35:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:43.409816 543705 memory.go:191] Add success.
I0319 13:35:43.409825 543705 cpu.go:282] Add success.
I0319 13:35:43.419870 543705 net.go:648] Add success.
I0319 13:35:43.422725 543705 net.go:770] primary dev: ETH0
I0319 13:35:43.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:35:43.422751 543705 net.go:698] Add success.
I0319 13:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:35:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:35:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:35:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:35:53.409803 543705 memory.go:184] no items to output this cycle
I0319 13:35:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 13:36:03.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:03.409759 543705 memory.go:184] no items to output this cycle
I0319 13:36:03.409837 543705 cpu.go:275] no items to output this cycle
E0319 13:36:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:13.409820 543705 memory.go:191] Add success.
I0319 13:36:13.409832 543705 cpu.go:282] Add success.
W0319 13:36:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:36:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:36:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:36:13.419723 543705 net.go:648] Add success.
I0319 13:36:13.422578 543705 net.go:770] primary dev: ETH0
I0319 13:36:13.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:36:13.422604 543705 net.go:698] Add success.
I0319 13:36:13.481360 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e2985640-baad-4dbc-85c9-d245ae8d49a5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:36:13.481391 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:36:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:36:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:36:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 13:36:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:36:14.456539 543705 disk_worker.go:494] system disk:vda1
I0319 13:36:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:36:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:36:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:36:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:36:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:36:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:36:19.405677 543705 disk_info.go:125] begin check local disk info of client
I0319 13:36:19.408102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:36:19.408108 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa480 0xc0001aa4c0]
E0319 13:36:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:23.409795 543705 memory.go:184] no items to output this cycle
I0319 13:36:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:36:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:33.409785 543705 memory.go:184] no items to output this cycle
I0319 13:36:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 13:36:37.723599 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:36:37.723606 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:36:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:43.410719 543705 memory.go:191] Add success.
I0319 13:36:43.409822 543705 cpu.go:282] Add success.
I0319 13:36:43.420424 543705 net.go:648] Add success.
I0319 13:36:43.423374 543705 net.go:770] primary dev: ETH0
I0319 13:36:43.423388 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:36:43.423407 543705 net.go:698] Add success.
I0319 13:36:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:36:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:36:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:36:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:36:53.409792 543705 cpu.go:275] no items to output this cycle
I0319 13:36:53.409799 543705 memory.go:184] no items to output this cycle
E0319 13:37:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:03.409781 543705 memory.go:184] no items to output this cycle
I0319 13:37:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 13:37:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:13.409792 543705 memory.go:191] Add success.
W0319 13:37:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:37:13.409817 543705 cpu.go:282] Add success.
W0319 13:37:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:37:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:37:13.420242 543705 net.go:648] Add success.
I0319 13:37:13.423513 543705 net.go:770] primary dev: ETH0
I0319 13:37:13.423528 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:37:13.423541 543705 net.go:698] Add success.
I0319 13:37:13.452777 543705 event_worker.go:152] Polling the log file for events...
W0319 13:37:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:37:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 13:37:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:37:14.456794 543705 disk_worker.go:494] system disk:vda1
I0319 13:37:14.456830 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:37:14.457045 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:37:14.457053 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:37:14.457058 543705 custom_config.go:64] query custom config with name: gpu
E0319 13:37:15.456885 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:37:15.456894 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:37:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:37:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:37:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:37:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:37:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:37:19.409672 543705 disk_info.go:125] begin check local disk info of client
I0319 13:37:19.412004 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:37:19.412010 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e1d40 0xc0003e1d80]
E0319 13:37:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:23.409812 543705 memory.go:184] no items to output this cycle
I0319 13:37:23.409823 543705 cpu.go:275] no items to output this cycle
E0319 13:37:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:33.409771 543705 memory.go:184] no items to output this cycle
I0319 13:37:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:37:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:43.409818 543705 memory.go:191] Add success.
I0319 13:37:43.409826 543705 cpu.go:282] Add success.
I0319 13:37:43.419957 543705 net.go:648] Add success.
I0319 13:37:43.422996 543705 net.go:770] primary dev: ETH0
I0319 13:37:43.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:37:43.423024 543705 net.go:698] Add success.
I0319 13:37:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:37:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:37:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:37:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:37:53.409778 543705 memory.go:184] no items to output this cycle
I0319 13:37:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:38:03.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:03.409758 543705 memory.go:184] no items to output this cycle
I0319 13:38:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:38:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:13.409826 543705 memory.go:191] Add success.
I0319 13:38:13.409841 543705 cpu.go:282] Add success.
W0319 13:38:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:38:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:38:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:38:13.420397 543705 net.go:648] Add success.
I0319 13:38:13.423219 543705 net.go:770] primary dev: ETH0
I0319 13:38:13.423232 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:38:13.423244 543705 net.go:698] Add success.
I0319 13:38:14.453981 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:38:14.454129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:38:14.454192 543705 disk_worker.go:708] disk space is not compliant
W0319 13:38:14.454195 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:38:14.455506 543705 disk_worker.go:494] system disk:vda1
I0319 13:38:14.455548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:38:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:38:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:38:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:38:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:38:16.472435 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:38:19.412792 543705 disk_info.go:125] begin check local disk info of client
I0319 13:38:19.415164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:38:19.415170 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587880 0xc0005878c0]
E0319 13:38:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:23.409799 543705 memory.go:184] no items to output this cycle
I0319 13:38:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:38:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:33.409788 543705 memory.go:184] no items to output this cycle
I0319 13:38:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 13:38:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:43.409799 543705 memory.go:191] Add success.
I0319 13:38:43.409805 543705 cpu.go:282] Add success.
I0319 13:38:43.419901 543705 net.go:648] Add success.
I0319 13:38:43.422818 543705 net.go:770] primary dev: ETH0
I0319 13:38:43.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:38:43.422847 543705 net.go:698] Add success.
I0319 13:38:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:38:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:38:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:38:53.410240 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:38:53.410262 543705 memory.go:184] no items to output this cycle
I0319 13:38:53.410266 543705 cpu.go:275] no items to output this cycle
E0319 13:39:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:03.409804 543705 memory.go:184] no items to output this cycle
I0319 13:39:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 13:39:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:13.409830 543705 memory.go:191] Add success.
I0319 13:39:13.409837 543705 cpu.go:282] Add success.
W0319 13:39:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:39:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:39:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:39:13.420124 543705 net.go:648] Add success.
I0319 13:39:13.422982 543705 net.go:770] primary dev: ETH0
I0319 13:39:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:39:13.423007 543705 net.go:698] Add success.
I0319 13:39:13.469117 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"18902144-eef8-4d0d-b835-fe78cb31df5c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:39:13.469150 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:39:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:39:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:39:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 13:39:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:39:14.458934 543705 disk_worker.go:494] system disk:vda1
I0319 13:39:14.458963 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:39:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:39:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:39:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:39:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:39:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:39:19.415795 543705 disk_info.go:125] begin check local disk info of client
I0319 13:39:19.418171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:39:19.418177 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003766c0 0xc000376700]
E0319 13:39:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:23.409806 543705 memory.go:184] no items to output this cycle
I0319 13:39:23.409817 543705 cpu.go:275] no items to output this cycle
E0319 13:39:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:33.409786 543705 memory.go:184] no items to output this cycle
I0319 13:39:33.409800 543705 cpu.go:275] no items to output this cycle
I0319 13:39:37.724313 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:39:37.724319 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:39:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:43.410716 543705 memory.go:191] Add success.
I0319 13:39:43.409827 543705 cpu.go:282] Add success.
I0319 13:39:43.420450 543705 net.go:648] Add success.
I0319 13:39:43.423138 543705 net.go:770] primary dev: ETH0
I0319 13:39:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:39:43.423164 543705 net.go:698] Add success.
I0319 13:39:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:39:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:39:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:39:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:39:53.409782 543705 memory.go:184] no items to output this cycle
I0319 13:39:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 13:40:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:03.409784 543705 memory.go:184] no items to output this cycle
I0319 13:40:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 13:40:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:13.409807 543705 memory.go:191] Add success.
I0319 13:40:13.409807 543705 cpu.go:282] Add success.
W0319 13:40:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:40:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:40:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:40:13.420217 543705 net.go:648] Add success.
I0319 13:40:13.422981 543705 net.go:770] primary dev: ETH0
I0319 13:40:13.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:40:13.423007 543705 net.go:698] Add success.
I0319 13:40:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:40:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:40:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 13:40:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:40:14.457642 543705 disk_worker.go:494] system disk:vda1
I0319 13:40:14.457698 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:40:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:40:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:40:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:40:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:40:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:40:19.418803 543705 disk_info.go:125] begin check local disk info of client
I0319 13:40:19.421196 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:40:19.421204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab500 0xc0001ab540]
E0319 13:40:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:23.409764 543705 memory.go:184] no items to output this cycle
I0319 13:40:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 13:40:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:33.409799 543705 memory.go:184] no items to output this cycle
I0319 13:40:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:40:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:43.409795 543705 memory.go:191] Add success.
I0319 13:40:43.409808 543705 cpu.go:282] Add success.
I0319 13:40:43.419879 543705 net.go:648] Add success.
I0319 13:40:43.422848 543705 net.go:770] primary dev: ETH0
I0319 13:40:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:40:43.422889 543705 net.go:698] Add success.
I0319 13:40:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:40:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:40:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:40:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:40:53.409770 543705 memory.go:184] no items to output this cycle
I0319 13:40:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 13:41:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:03.409797 543705 memory.go:184] no items to output this cycle
I0319 13:41:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 13:41:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:13.409822 543705 memory.go:191] Add success.
I0319 13:41:13.409829 543705 cpu.go:282] Add success.
W0319 13:41:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:41:13.409883 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:41:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:41:13.420147 543705 net.go:648] Add success.
I0319 13:41:13.422950 543705 net.go:770] primary dev: ETH0
I0319 13:41:13.422964 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:41:13.422976 543705 net.go:698] Add success.
I0319 13:41:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:41:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:41:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 13:41:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:41:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 13:41:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:41:15.454980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:41:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:41:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:41:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:41:19.421800 543705 disk_info.go:125] begin check local disk info of client
I0319 13:41:19.424221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:41:19.424227 543705 disk_info.go:196] parse disk info done, disk is : [0xc000380400 0xc000380440]
E0319 13:41:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:23.409769 543705 memory.go:184] no items to output this cycle
I0319 13:41:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 13:41:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:33.409798 543705 memory.go:184] no items to output this cycle
I0319 13:41:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 13:41:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:43.409812 543705 memory.go:191] Add success.
I0319 13:41:43.409817 543705 cpu.go:282] Add success.
I0319 13:41:43.419856 543705 net.go:648] Add success.
I0319 13:41:43.422765 543705 net.go:770] primary dev: ETH0
I0319 13:41:43.422778 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:41:43.422791 543705 net.go:698] Add success.
I0319 13:41:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:41:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:41:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:41:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:41:53.409799 543705 memory.go:184] no items to output this cycle
I0319 13:41:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 13:42:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:03.409771 543705 memory.go:184] no items to output this cycle
I0319 13:42:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 13:42:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:13.409810 543705 memory.go:191] Add success.
I0319 13:42:13.409810 543705 cpu.go:282] Add success.
W0319 13:42:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:42:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:42:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:42:13.420150 543705 net.go:648] Add success.
I0319 13:42:13.422845 543705 net.go:770] primary dev: ETH0
I0319 13:42:13.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:42:13.422874 543705 net.go:698] Add success.
I0319 13:42:13.468912 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e83beed2-c5dc-4058-98d1-19b4c7a2ec2f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:42:13.468960 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 13:42:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:42:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 13:42:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:42:14.456805 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:42:14.456814 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:42:14.456820 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:42:14.456838 543705 disk_worker.go:494] system disk:vda1
I0319 13:42:14.456867 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:42:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:42:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:42:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:42:16.458011 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:42:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:42:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:42:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:42:19.424819 543705 disk_info.go:125] begin check local disk info of client
I0319 13:42:19.427171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:42:19.427179 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005140c0 0xc000514100]
E0319 13:42:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:23.409775 543705 memory.go:184] no items to output this cycle
I0319 13:42:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 13:42:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:33.409781 543705 memory.go:184] no items to output this cycle
I0319 13:42:33.409822 543705 cpu.go:275] no items to output this cycle
I0319 13:42:37.724456 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:42:37.724464 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:42:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:43.409822 543705 cpu.go:282] Add success.
I0319 13:42:43.410748 543705 memory.go:191] Add success.
I0319 13:42:43.420519 543705 net.go:648] Add success.
I0319 13:42:43.423631 543705 net.go:770] primary dev: ETH0
I0319 13:42:43.423648 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:42:43.423663 543705 net.go:698] Add success.
I0319 13:42:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:42:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:42:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:42:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:42:53.409804 543705 memory.go:184] no items to output this cycle
I0319 13:42:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 13:43:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:03.409779 543705 memory.go:184] no items to output this cycle
I0319 13:43:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 13:43:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:13.409820 543705 memory.go:191] Add success.
I0319 13:43:13.409830 543705 cpu.go:282] Add success.
W0319 13:43:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:43:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:43:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:43:13.420260 543705 net.go:648] Add success.
I0319 13:43:13.423053 543705 net.go:770] primary dev: ETH0
I0319 13:43:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:43:13.423087 543705 net.go:698] Add success.
I0319 13:43:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:43:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:43:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 13:43:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:43:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 13:43:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:43:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:43:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:43:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:43:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:43:16.472454 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:43:19.427831 543705 disk_info.go:125] begin check local disk info of client
I0319 13:43:19.430273 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:43:19.430279 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a7c0 0xc00007a940]
E0319 13:43:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:23.409791 543705 memory.go:184] no items to output this cycle
I0319 13:43:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:43:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:33.409776 543705 memory.go:184] no items to output this cycle
I0319 13:43:33.409780 543705 cpu.go:275] no items to output this cycle
E0319 13:43:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:43.409797 543705 memory.go:191] Add success.
I0319 13:43:43.409802 543705 cpu.go:282] Add success.
I0319 13:43:43.419874 543705 net.go:648] Add success.
I0319 13:43:43.422869 543705 net.go:770] primary dev: ETH0
I0319 13:43:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:43:43.422895 543705 net.go:698] Add success.
I0319 13:43:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:43:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:43:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:43:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:43:53.409808 543705 memory.go:184] no items to output this cycle
I0319 13:43:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 13:44:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:03.409783 543705 memory.go:184] no items to output this cycle
I0319 13:44:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 13:44:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:13.409796 543705 memory.go:191] Add success.
I0319 13:44:13.409814 543705 cpu.go:282] Add success.
W0319 13:44:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:44:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:44:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:44:13.420180 543705 net.go:648] Add success.
I0319 13:44:13.423030 543705 net.go:770] primary dev: ETH0
I0319 13:44:13.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:44:13.423059 543705 net.go:698] Add success.
I0319 13:44:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:44:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:44:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 13:44:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:44:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 13:44:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:44:15.456024 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:44:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:44:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:44:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:44:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:44:19.430838 543705 disk_info.go:125] begin check local disk info of client
I0319 13:44:19.433297 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:44:19.433303 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a7c0 0xc00007a940]
E0319 13:44:23.410416 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:23.410431 543705 memory.go:184] no items to output this cycle
I0319 13:44:23.410432 543705 cpu.go:275] no items to output this cycle
E0319 13:44:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:33.409795 543705 memory.go:184] no items to output this cycle
I0319 13:44:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:44:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:43.409820 543705 memory.go:191] Add success.
I0319 13:44:43.409830 543705 cpu.go:282] Add success.
I0319 13:44:43.420305 543705 net.go:648] Add success.
I0319 13:44:43.422941 543705 net.go:770] primary dev: ETH0
I0319 13:44:43.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:44:43.422967 543705 net.go:698] Add success.
I0319 13:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:44:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:44:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:44:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:44:53.409799 543705 memory.go:184] no items to output this cycle
I0319 13:44:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 13:45:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:03.409773 543705 memory.go:184] no items to output this cycle
I0319 13:45:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 13:45:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:13.409795 543705 memory.go:191] Add success.
I0319 13:45:13.409816 543705 cpu.go:282] Add success.
W0319 13:45:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:45:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:45:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:45:13.420134 543705 net.go:648] Add success.
I0319 13:45:13.423309 543705 net.go:770] primary dev: ETH0
I0319 13:45:13.423323 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:45:13.423338 543705 net.go:698] Add success.
I0319 13:45:13.463335 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fdd5b097-c578-4636-9171-6eaf45f46cb8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:45:13.463367 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:45:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:45:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:45:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 13:45:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:45:14.456623 543705 disk_worker.go:494] system disk:vda1
I0319 13:45:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:45:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:45:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:45:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:45:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:45:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:45:19.433858 543705 disk_info.go:125] begin check local disk info of client
I0319 13:45:19.436240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:45:19.436246 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c080 0xc00034c0c0]
E0319 13:45:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:23.409795 543705 memory.go:184] no items to output this cycle
I0319 13:45:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 13:45:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:33.409785 543705 memory.go:184] no items to output this cycle
I0319 13:45:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 13:45:37.724602 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:45:37.724608 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:45:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:43.410704 543705 memory.go:191] Add success.
I0319 13:45:43.409813 543705 cpu.go:282] Add success.
I0319 13:45:43.420415 543705 net.go:648] Add success.
I0319 13:45:43.423109 543705 net.go:770] primary dev: ETH0
I0319 13:45:43.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:45:43.423134 543705 net.go:698] Add success.
I0319 13:45:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:45:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:45:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:45:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:45:53.409801 543705 memory.go:184] no items to output this cycle
I0319 13:45:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:46:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:03.409772 543705 memory.go:184] no items to output this cycle
I0319 13:46:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 13:46:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:13.409798 543705 memory.go:191] Add success.
I0319 13:46:13.409802 543705 cpu.go:282] Add success.
W0319 13:46:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:46:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:46:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:46:13.420266 543705 net.go:648] Add success.
I0319 13:46:13.423185 543705 net.go:770] primary dev: ETH0
I0319 13:46:13.423198 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:46:13.423211 543705 net.go:698] Add success.
I0319 13:46:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:46:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:46:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 13:46:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:46:14.456634 543705 disk_worker.go:494] system disk:vda1
I0319 13:46:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:46:15.456016 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:46:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:46:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:46:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:46:16.472493 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:46:19.436877 543705 disk_info.go:125] begin check local disk info of client
I0319 13:46:19.439319 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:46:19.439325 543705 disk_info.go:196] parse disk info done, disk is : [0xc000544480 0xc0005444c0]
E0319 13:46:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:23.409771 543705 memory.go:184] no items to output this cycle
I0319 13:46:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 13:46:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:33.409798 543705 memory.go:184] no items to output this cycle
I0319 13:46:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:46:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:43.409815 543705 memory.go:191] Add success.
I0319 13:46:43.409823 543705 cpu.go:282] Add success.
I0319 13:46:43.419961 543705 net.go:648] Add success.
I0319 13:46:43.423098 543705 net.go:770] primary dev: ETH0
I0319 13:46:43.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:46:43.423124 543705 net.go:698] Add success.
I0319 13:46:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:46:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:46:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:46:53.410509 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:46:53.410525 543705 memory.go:184] no items to output this cycle
I0319 13:46:53.410531 543705 cpu.go:275] no items to output this cycle
E0319 13:47:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:03.409774 543705 memory.go:184] no items to output this cycle
I0319 13:47:03.409778 543705 cpu.go:275] no items to output this cycle
E0319 13:47:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:13.409781 543705 memory.go:191] Add success.
W0319 13:47:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 13:47:13.409814 543705 cpu.go:282] Add success.
W0319 13:47:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:47:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:47:13.420120 543705 net.go:648] Add success.
I0319 13:47:13.423188 543705 net.go:770] primary dev: ETH0
I0319 13:47:13.423201 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:47:13.423213 543705 net.go:698] Add success.
I0319 13:47:13.453744 543705 event_worker.go:152] Polling the log file for events...
W0319 13:47:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:47:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 13:47:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:47:14.455911 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:47:14.455920 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:47:14.455926 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:47:14.456568 543705 disk_worker.go:494] system disk:vda1
I0319 13:47:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:47:15.456851 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:47:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:47:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:47:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:47:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:47:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:47:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:47:19.439877 543705 disk_info.go:125] begin check local disk info of client
I0319 13:47:19.442239 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:47:19.442246 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000d7440 0xc0000d7480]
E0319 13:47:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:23.409797 543705 memory.go:184] no items to output this cycle
I0319 13:47:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 13:47:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:33.409803 543705 memory.go:184] no items to output this cycle
I0319 13:47:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 13:47:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:43.409819 543705 memory.go:191] Add success.
I0319 13:47:43.409836 543705 cpu.go:282] Add success.
I0319 13:47:43.419973 543705 net.go:648] Add success.
I0319 13:47:43.422711 543705 net.go:770] primary dev: ETH0
I0319 13:47:43.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:47:43.422741 543705 net.go:698] Add success.
I0319 13:47:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:47:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:47:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:47:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:47:53.409797 543705 memory.go:184] no items to output this cycle
I0319 13:47:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 13:48:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:03.409785 543705 memory.go:184] no items to output this cycle
I0319 13:48:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 13:48:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:13.409801 543705 memory.go:191] Add success.
I0319 13:48:13.409803 543705 cpu.go:282] Add success.
W0319 13:48:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:48:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:48:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:48:13.420215 543705 net.go:648] Add success.
I0319 13:48:13.422978 543705 net.go:770] primary dev: ETH0
I0319 13:48:13.422991 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:48:13.423003 543705 net.go:698] Add success.
I0319 13:48:13.464058 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cc6d80eb-0932-4ccf-9002-4e0f972051e7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:48:13.464091 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:48:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:48:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:48:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 13:48:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:48:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 13:48:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:48:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:48:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:48:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:48:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:48:16.472413 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:48:19.442906 543705 disk_info.go:125] begin check local disk info of client
I0319 13:48:19.445265 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:48:19.445270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001cccc0 0xc0001ccd00]
E0319 13:48:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:23.409789 543705 memory.go:184] no items to output this cycle
I0319 13:48:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:48:33.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:33.409826 543705 memory.go:184] no items to output this cycle
I0319 13:48:33.409836 543705 cpu.go:275] no items to output this cycle
I0319 13:48:37.725315 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:48:37.725322 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:48:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:43.410782 543705 memory.go:191] Add success.
I0319 13:48:43.409821 543705 cpu.go:282] Add success.
I0319 13:48:43.420301 543705 net.go:770] primary dev: ETH0
I0319 13:48:43.420314 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:48:43.420327 543705 net.go:698] Add success.
I0319 13:48:43.420674 543705 net.go:648] Add success.
I0319 13:48:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:48:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:48:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:48:53.410260 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:48:53.410280 543705 memory.go:184] no items to output this cycle
I0319 13:48:53.410297 543705 cpu.go:275] no items to output this cycle
E0319 13:49:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:03.409764 543705 memory.go:184] no items to output this cycle
I0319 13:49:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 13:49:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:13.409819 543705 memory.go:191] Add success.
I0319 13:49:13.409827 543705 cpu.go:282] Add success.
W0319 13:49:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:49:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:49:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:49:13.420187 543705 net.go:648] Add success.
I0319 13:49:13.422987 543705 net.go:770] primary dev: ETH0
I0319 13:49:13.423000 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:49:13.423013 543705 net.go:698] Add success.
I0319 13:49:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:49:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:49:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 13:49:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:49:14.456564 543705 disk_worker.go:494] system disk:vda1
I0319 13:49:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:49:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:49:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:49:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:49:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:49:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:49:19.445919 543705 disk_info.go:125] begin check local disk info of client
I0319 13:49:19.448278 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:49:19.448284 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252440 0xc000252480]
E0319 13:49:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:23.409788 543705 memory.go:184] no items to output this cycle
I0319 13:49:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:49:33.409857 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:33.409894 543705 memory.go:184] no items to output this cycle
I0319 13:49:33.410022 543705 cpu.go:275] no items to output this cycle
E0319 13:49:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:43.409825 543705 memory.go:191] Add success.
I0319 13:49:43.409836 543705 cpu.go:282] Add success.
I0319 13:49:43.419917 543705 net.go:648] Add success.
I0319 13:49:43.422770 543705 net.go:770] primary dev: ETH0
I0319 13:49:43.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:49:43.422794 543705 net.go:698] Add success.
I0319 13:49:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:49:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:49:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:49:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:49:53.409766 543705 memory.go:184] no items to output this cycle
I0319 13:49:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 13:50:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:03.409796 543705 memory.go:184] no items to output this cycle
I0319 13:50:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 13:50:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:13.409827 543705 memory.go:191] Add success.
I0319 13:50:13.409830 543705 cpu.go:282] Add success.
W0319 13:50:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:50:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:50:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:50:13.420155 543705 net.go:648] Add success.
I0319 13:50:13.423239 543705 net.go:770] primary dev: ETH0
I0319 13:50:13.423252 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:50:13.423264 543705 net.go:698] Add success.
I0319 13:50:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:50:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:50:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0319 13:50:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:50:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 13:50:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:50:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:50:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:50:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:50:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:50:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:50:19.448938 543705 disk_info.go:125] begin check local disk info of client
I0319 13:50:19.451373 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:50:19.451379 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028b0c0 0xc00028b100]
E0319 13:50:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:23.409794 543705 memory.go:184] no items to output this cycle
I0319 13:50:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 13:50:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:33.409788 543705 memory.go:184] no items to output this cycle
I0319 13:50:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 13:50:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:43.409827 543705 memory.go:191] Add success.
I0319 13:50:43.409830 543705 cpu.go:282] Add success.
I0319 13:50:43.420060 543705 net.go:648] Add success.
I0319 13:50:43.423012 543705 net.go:770] primary dev: ETH0
I0319 13:50:43.423025 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:50:43.423037 543705 net.go:698] Add success.
I0319 13:50:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:50:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:50:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:50:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:50:53.409799 543705 memory.go:184] no items to output this cycle
I0319 13:50:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:51:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:03.409777 543705 memory.go:184] no items to output this cycle
I0319 13:51:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 13:51:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:13.409801 543705 memory.go:191] Add success.
I0319 13:51:13.409803 543705 cpu.go:282] Add success.
W0319 13:51:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:51:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:51:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:51:13.420150 543705 net.go:648] Add success.
I0319 13:51:13.422694 543705 net.go:770] primary dev: ETH0
I0319 13:51:13.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:51:13.422724 543705 net.go:698] Add success.
I0319 13:51:13.473748 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cd531098-b312-43ea-b9ae-26555b27b2e6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:51:13.473779 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:51:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:51:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:51:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 13:51:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:51:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 13:51:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:51:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:51:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:51:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:51:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:51:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:51:19.451942 543705 disk_info.go:125] begin check local disk info of client
I0319 13:51:19.454356 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:51:19.454362 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a6180 0xc0002a61c0]
E0319 13:51:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:23.409791 543705 memory.go:184] no items to output this cycle
I0319 13:51:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 13:51:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:33.409797 543705 memory.go:184] no items to output this cycle
I0319 13:51:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 13:51:37.725739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:51:37.725746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:51:43.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:43.410785 543705 memory.go:191] Add success.
I0319 13:51:43.409995 543705 cpu.go:282] Add success.
I0319 13:51:43.419730 543705 net.go:648] Add success.
I0319 13:51:43.422424 543705 net.go:770] primary dev: ETH0
I0319 13:51:43.422437 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:51:43.422449 543705 net.go:698] Add success.
I0319 13:51:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:51:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:51:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:51:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:51:53.409765 543705 memory.go:184] no items to output this cycle
I0319 13:51:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 13:52:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:03.409795 543705 memory.go:184] no items to output this cycle
I0319 13:52:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 13:52:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:13.409798 543705 memory.go:191] Add success.
I0319 13:52:13.409802 543705 cpu.go:282] Add success.
W0319 13:52:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:52:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:52:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:52:13.420072 543705 net.go:648] Add success.
I0319 13:52:13.423089 543705 net.go:770] primary dev: ETH0
I0319 13:52:13.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:52:13.423117 543705 net.go:698] Add success.
W0319 13:52:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:52:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 13:52:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:52:14.455872 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:52:14.455880 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:52:14.455886 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:52:14.456569 543705 disk_worker.go:494] system disk:vda1
I0319 13:52:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:52:15.456859 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:52:15.456867 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:52:16.457899 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:52:16.457899 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:52:16.457957 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:52:16.457976 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:52:16.472288 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:52:19.454961 543705 disk_info.go:125] begin check local disk info of client
I0319 13:52:19.457289 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:52:19.457295 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028a140 0xc00028a180]
E0319 13:52:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:23.409800 543705 memory.go:184] no items to output this cycle
I0319 13:52:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:52:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:33.409807 543705 memory.go:184] no items to output this cycle
I0319 13:52:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 13:52:43.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:43.409936 543705 memory.go:191] Add success.
I0319 13:52:43.409945 543705 cpu.go:282] Add success.
I0319 13:52:43.419714 543705 net.go:648] Add success.
I0319 13:52:43.422444 543705 net.go:770] primary dev: ETH0
I0319 13:52:43.422457 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:52:43.422468 543705 net.go:698] Add success.
I0319 13:52:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:52:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:52:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:52:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:52:53.409800 543705 memory.go:184] no items to output this cycle
I0319 13:52:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:53:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:03.409769 543705 memory.go:184] no items to output this cycle
I0319 13:53:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 13:53:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:13.409823 543705 memory.go:191] Add success.
I0319 13:53:13.409826 543705 cpu.go:282] Add success.
W0319 13:53:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:53:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:53:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:53:13.420238 543705 net.go:648] Add success.
I0319 13:53:13.423589 543705 net.go:770] primary dev: ETH0
I0319 13:53:13.423604 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:53:13.423616 543705 net.go:698] Add success.
I0319 13:53:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:53:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:53:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 13:53:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:53:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 13:53:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:53:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:53:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:53:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:53:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:53:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:53:19.457972 543705 disk_info.go:125] begin check local disk info of client
I0319 13:53:19.460329 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:53:19.460336 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a3440 0xc0004a3480]
E0319 13:53:23.410025 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:23.410041 543705 memory.go:184] no items to output this cycle
I0319 13:53:23.410116 543705 cpu.go:275] no items to output this cycle
E0319 13:53:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:33.409778 543705 memory.go:184] no items to output this cycle
I0319 13:53:33.409781 543705 cpu.go:275] no items to output this cycle
E0319 13:53:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:43.409798 543705 memory.go:191] Add success.
I0319 13:53:43.409799 543705 cpu.go:282] Add success.
I0319 13:53:43.420149 543705 net.go:648] Add success.
I0319 13:53:43.422980 543705 net.go:770] primary dev: ETH0
I0319 13:53:43.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:53:43.423006 543705 net.go:698] Add success.
I0319 13:53:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:53:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:53:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:53:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:53:53.409800 543705 memory.go:184] no items to output this cycle
I0319 13:53:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 13:54:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:03.409774 543705 memory.go:184] no items to output this cycle
I0319 13:54:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 13:54:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:13.409800 543705 memory.go:191] Add success.
I0319 13:54:13.409806 543705 cpu.go:282] Add success.
W0319 13:54:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:54:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:54:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:54:13.420145 543705 net.go:648] Add success.
I0319 13:54:13.423047 543705 net.go:770] primary dev: ETH0
I0319 13:54:13.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:54:13.423073 543705 net.go:698] Add success.
I0319 13:54:13.464069 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"125aaa0a-4311-47d3-821c-4cd77b89ed51","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:54:13.464105 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 13:54:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:54:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:54:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 13:54:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:54:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 13:54:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:54:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:54:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:54:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:54:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:54:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:54:19.460807 543705 disk_info.go:125] begin check local disk info of client
I0319 13:54:19.463176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:54:19.463182 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005323c0 0xc000532400]
E0319 13:54:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:23.409796 543705 memory.go:184] no items to output this cycle
I0319 13:54:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 13:54:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:33.409813 543705 memory.go:184] no items to output this cycle
I0319 13:54:33.409829 543705 cpu.go:275] no items to output this cycle
I0319 13:54:37.727337 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:54:37.727345 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:54:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:43.410696 543705 memory.go:191] Add success.
I0319 13:54:43.409824 543705 cpu.go:282] Add success.
I0319 13:54:43.420682 543705 net.go:648] Add success.
I0319 13:54:43.423382 543705 net.go:770] primary dev: ETH0
I0319 13:54:43.423396 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:54:43.423408 543705 net.go:698] Add success.
I0319 13:54:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:54:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:54:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:54:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:54:53.409798 543705 memory.go:184] no items to output this cycle
I0319 13:54:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 13:55:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:03.409777 543705 memory.go:184] no items to output this cycle
I0319 13:55:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 13:55:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:13.409826 543705 memory.go:191] Add success.
I0319 13:55:13.409831 543705 cpu.go:282] Add success.
W0319 13:55:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:55:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:55:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:55:13.420120 543705 net.go:648] Add success.
I0319 13:55:13.422896 543705 net.go:770] primary dev: ETH0
I0319 13:55:13.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:55:13.422920 543705 net.go:698] Add success.
I0319 13:55:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:55:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:55:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 13:55:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:55:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 13:55:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:55:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:55:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:55:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:55:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:55:19.464003 543705 disk_info.go:125] begin check local disk info of client
I0319 13:55:19.466386 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:55:19.466392 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b740 0xc00007b780]
E0319 13:55:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:23.409793 543705 memory.go:184] no items to output this cycle
I0319 13:55:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 13:55:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:33.409781 543705 memory.go:184] no items to output this cycle
I0319 13:55:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 13:55:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:43.409792 543705 memory.go:191] Add success.
I0319 13:55:43.409819 543705 cpu.go:282] Add success.
I0319 13:55:43.419992 543705 net.go:648] Add success.
I0319 13:55:43.422888 543705 net.go:770] primary dev: ETH0
I0319 13:55:43.422901 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:55:43.422913 543705 net.go:698] Add success.
I0319 13:55:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:55:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:55:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:55:53.410428 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:55:53.410503 543705 memory.go:184] no items to output this cycle
I0319 13:55:53.410566 543705 cpu.go:275] no items to output this cycle
E0319 13:56:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:03.409784 543705 memory.go:184] no items to output this cycle
I0319 13:56:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:56:13.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:13.409845 543705 memory.go:191] Add success.
I0319 13:56:13.409846 543705 cpu.go:282] Add success.
W0319 13:56:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:56:13.409887 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:56:13.409890 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:56:13.420292 543705 net.go:648] Add success.
I0319 13:56:13.423062 543705 net.go:770] primary dev: ETH0
I0319 13:56:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:56:13.423092 543705 net.go:698] Add success.
I0319 13:56:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:56:14.455090 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:56:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 13:56:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:56:14.456493 543705 disk_worker.go:494] system disk:vda1
I0319 13:56:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:56:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:56:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:56:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:56:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:56:19.467084 543705 disk_info.go:125] begin check local disk info of client
I0319 13:56:19.469457 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:56:19.469463 543705 disk_info.go:196] parse disk info done, disk is : [0xc000498240 0xc000498280]
E0319 13:56:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:23.409808 543705 memory.go:184] no items to output this cycle
I0319 13:56:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 13:56:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:33.409789 543705 memory.go:184] no items to output this cycle
I0319 13:56:33.409862 543705 cpu.go:275] no items to output this cycle
E0319 13:56:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:43.409842 543705 memory.go:191] Add success.
I0319 13:56:43.409847 543705 cpu.go:282] Add success.
I0319 13:56:43.420072 543705 net.go:648] Add success.
I0319 13:56:43.422684 543705 net.go:770] primary dev: ETH0
I0319 13:56:43.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:56:43.422709 543705 net.go:698] Add success.
I0319 13:56:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:56:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:56:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:56:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:56:53.409868 543705 memory.go:184] no items to output this cycle
I0319 13:56:53.409937 543705 cpu.go:275] no items to output this cycle
E0319 13:57:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:03.409776 543705 memory.go:184] no items to output this cycle
I0319 13:57:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:57:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:13.409807 543705 memory.go:191] Add success.
I0319 13:57:13.409809 543705 cpu.go:282] Add success.
W0319 13:57:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:57:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:57:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:57:13.420163 543705 net.go:648] Add success.
I0319 13:57:13.423485 543705 net.go:770] primary dev: ETH0
I0319 13:57:13.423497 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:57:13.423509 543705 net.go:698] Add success.
I0319 13:57:13.429943 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 13:57:13.453114 543705 event_worker.go:152] Polling the log file for events...
I0319 13:57:13.468151 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c0363d42-c9b0-452e-a8aa-721c1da4a994","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 13:57:13.468193 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 13:57:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:57:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 13:57:14.455175 543705 disk_worker.go:728] disk inode is not compliant
E0319 13:57:14.456790 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 13:57:14.456807 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 13:57:14.456813 543705 custom_config.go:64] query custom config with name: gpu
I0319 13:57:14.456825 543705 disk_worker.go:494] system disk:vda1
I0319 13:57:14.456857 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 13:57:15.456788 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 13:57:15.456797 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:57:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 13:57:16.457976 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 13:57:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:57:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:57:16.472358 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:57:19.470041 543705 disk_info.go:125] begin check local disk info of client
I0319 13:57:19.472419 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:57:19.472425 543705 disk_info.go:196] parse disk info done, disk is : [0xc000499200 0xc000499240]
E0319 13:57:23.410389 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:23.410405 543705 memory.go:184] no items to output this cycle
I0319 13:57:23.410436 543705 cpu.go:275] no items to output this cycle
E0319 13:57:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:33.409803 543705 memory.go:184] no items to output this cycle
I0319 13:57:33.409816 543705 cpu.go:275] no items to output this cycle
I0319 13:57:37.728345 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 13:57:37.728352 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 13:57:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:43.410582 543705 memory.go:191] Add success.
I0319 13:57:43.409813 543705 cpu.go:282] Add success.
I0319 13:57:43.420256 543705 net.go:648] Add success.
I0319 13:57:43.423029 543705 net.go:770] primary dev: ETH0
I0319 13:57:43.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:57:43.423053 543705 net.go:698] Add success.
I0319 13:57:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:57:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:57:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:57:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:57:53.409766 543705 memory.go:184] no items to output this cycle
I0319 13:57:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:58:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:03.409767 543705 memory.go:184] no items to output this cycle
I0319 13:58:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 13:58:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:13.409801 543705 memory.go:191] Add success.
I0319 13:58:13.409807 543705 cpu.go:282] Add success.
W0319 13:58:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:58:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:58:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:58:13.420141 543705 net.go:648] Add success.
I0319 13:58:13.423016 543705 net.go:770] primary dev: ETH0
I0319 13:58:13.423029 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:58:13.423042 543705 net.go:698] Add success.
I0319 13:58:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:58:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:58:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 13:58:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:58:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 13:58:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:58:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:58:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:58:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:58:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:58:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:58:19.472889 543705 disk_info.go:125] begin check local disk info of client
I0319 13:58:19.475304 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:58:19.475311 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7cc0 0xc0003b7d00]
E0319 13:58:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:23.409762 543705 memory.go:184] no items to output this cycle
I0319 13:58:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 13:58:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:33.409776 543705 memory.go:184] no items to output this cycle
I0319 13:58:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 13:58:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:43.409801 543705 memory.go:191] Add success.
I0319 13:58:43.409804 543705 cpu.go:282] Add success.
I0319 13:58:43.419871 543705 net.go:648] Add success.
I0319 13:58:43.422586 543705 net.go:770] primary dev: ETH0
I0319 13:58:43.422599 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:58:43.422611 543705 net.go:698] Add success.
I0319 13:58:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:58:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:58:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:58:53.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:58:53.409762 543705 memory.go:184] no items to output this cycle
I0319 13:58:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 13:59:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:03.409768 543705 memory.go:184] no items to output this cycle
I0319 13:59:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 13:59:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:13.409807 543705 memory.go:191] Add success.
I0319 13:59:13.409807 543705 cpu.go:282] Add success.
W0319 13:59:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 13:59:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 13:59:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 13:59:13.420190 543705 net.go:648] Add success.
I0319 13:59:13.423080 543705 net.go:770] primary dev: ETH0
I0319 13:59:13.423092 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:59:13.423104 543705 net.go:698] Add success.
I0319 13:59:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 13:59:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 13:59:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 13:59:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 13:59:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 13:59:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 13:59:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 13:59:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:59:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:59:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 13:59:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 13:59:19.476123 543705 disk_info.go:125] begin check local disk info of client
I0319 13:59:19.478587 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 13:59:19.478594 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329080 0xc0003290c0]
E0319 13:59:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:23.409758 543705 memory.go:184] no items to output this cycle
I0319 13:59:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 13:59:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:33.409784 543705 memory.go:184] no items to output this cycle
I0319 13:59:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 13:59:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:43.409817 543705 memory.go:191] Add success.
I0319 13:59:43.409820 543705 cpu.go:282] Add success.
I0319 13:59:43.419992 543705 net.go:648] Add success.
I0319 13:59:43.423048 543705 net.go:770] primary dev: ETH0
I0319 13:59:43.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0319 13:59:43.423073 543705 net.go:698] Add success.
I0319 13:59:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 13:59:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 13:59:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0319 13:59:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 13:59:53.409777 543705 memory.go:184] no items to output this cycle
I0319 13:59:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 14:00:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:03.409765 543705 memory.go:184] no items to output this cycle
I0319 14:00:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 14:00:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:13.409818 543705 memory.go:191] Add success.
I0319 14:00:13.409836 543705 cpu.go:282] Add success.
W0319 14:00:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:00:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:00:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:00:13.420514 543705 net.go:648] Add success.
I0319 14:00:13.423450 543705 net.go:770] primary dev: ETH0
I0319 14:00:13.423465 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:00:13.423478 543705 net.go:698] Add success.
I0319 14:00:13.467826 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4746de8d-b88d-4912-b0f7-123f6208adc6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:00:13.467862 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:00:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:00:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:00:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 14:00:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:00:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 14:00:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:00:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:00:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:00:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:00:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:00:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:00:19.479082 543705 disk_info.go:125] begin check local disk info of client
I0319 14:00:19.481463 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:00:19.481469 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c49c0 0xc0000c4a00]
E0319 14:00:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:23.409792 543705 memory.go:184] no items to output this cycle
I0319 14:00:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:00:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:33.409779 543705 memory.go:184] no items to output this cycle
I0319 14:00:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 14:00:37.729337 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:00:37.729345 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:00:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:43.410673 543705 memory.go:191] Add success.
I0319 14:00:43.409807 543705 cpu.go:282] Add success.
I0319 14:00:43.420376 543705 net.go:648] Add success.
I0319 14:00:43.423513 543705 net.go:770] primary dev: ETH0
I0319 14:00:43.423525 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:00:43.423537 543705 net.go:698] Add success.
I0319 14:00:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:00:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:00:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:00:53.410475 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:00:53.410493 543705 memory.go:184] no items to output this cycle
I0319 14:00:53.410507 543705 cpu.go:275] no items to output this cycle
E0319 14:01:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:03.409761 543705 memory.go:184] no items to output this cycle
I0319 14:01:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:01:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:13.409817 543705 memory.go:191] Add success.
I0319 14:01:13.409817 543705 cpu.go:282] Add success.
W0319 14:01:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:01:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:01:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:01:13.420190 543705 net.go:648] Add success.
I0319 14:01:13.422910 543705 net.go:770] primary dev: ETH0
I0319 14:01:13.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:01:13.422935 543705 net.go:698] Add success.
I0319 14:01:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:01:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:01:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 14:01:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:01:14.456559 543705 disk_worker.go:494] system disk:vda1
I0319 14:01:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:01:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:01:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:01:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:01:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:01:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:01:19.482099 543705 disk_info.go:125] begin check local disk info of client
I0319 14:01:19.484500 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:01:19.484506 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4c80 0xc0000c4cc0]
E0319 14:01:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:23.409763 543705 memory.go:184] no items to output this cycle
I0319 14:01:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 14:01:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:33.409780 543705 memory.go:184] no items to output this cycle
I0319 14:01:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:01:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:43.409800 543705 memory.go:191] Add success.
I0319 14:01:43.409802 543705 cpu.go:282] Add success.
I0319 14:01:43.420177 543705 net.go:648] Add success.
I0319 14:01:43.423078 543705 net.go:770] primary dev: ETH0
I0319 14:01:43.423092 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:01:43.423105 543705 net.go:698] Add success.
I0319 14:01:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:01:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:01:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:01:53.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:01:53.409867 543705 memory.go:184] no items to output this cycle
I0319 14:01:53.409959 543705 cpu.go:275] no items to output this cycle
E0319 14:02:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:03.409772 543705 memory.go:184] no items to output this cycle
I0319 14:02:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 14:02:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:13.409790 543705 memory.go:191] Add success.
W0319 14:02:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:02:13.409823 543705 cpu.go:282] Add success.
W0319 14:02:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:02:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:02:13.420156 543705 net.go:648] Add success.
I0319 14:02:13.422701 543705 net.go:770] primary dev: ETH0
I0319 14:02:13.422717 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:02:13.422732 543705 net.go:698] Add success.
W0319 14:02:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:02:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 14:02:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:02:14.456928 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:02:14.456937 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:02:14.456943 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:02:14.457005 543705 disk_worker.go:494] system disk:vda1
I0319 14:02:14.457039 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:02:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:02:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:02:16.457910 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:02:16.457909 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:02:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:02:16.457983 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:02:16.472299 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:02:19.485116 543705 disk_info.go:125] begin check local disk info of client
I0319 14:02:19.487465 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:02:19.487470 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5900 0xc0000c5940]
E0319 14:02:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:23.409795 543705 memory.go:184] no items to output this cycle
I0319 14:02:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:02:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:33.409785 543705 memory.go:184] no items to output this cycle
I0319 14:02:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 14:02:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:43.409802 543705 memory.go:191] Add success.
I0319 14:02:43.409803 543705 cpu.go:282] Add success.
I0319 14:02:43.420012 543705 net.go:648] Add success.
I0319 14:02:43.422526 543705 net.go:770] primary dev: ETH0
I0319 14:02:43.422540 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:02:43.422553 543705 net.go:698] Add success.
I0319 14:02:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:02:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:02:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:02:53.409833 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:02:53.409850 543705 memory.go:184] no items to output this cycle
I0319 14:02:53.409932 543705 cpu.go:275] no items to output this cycle
E0319 14:03:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:03.409766 543705 memory.go:184] no items to output this cycle
I0319 14:03:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 14:03:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:13.409827 543705 memory.go:191] Add success.
I0319 14:03:13.409837 543705 cpu.go:282] Add success.
W0319 14:03:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:03:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:03:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:03:13.420237 543705 net.go:648] Add success.
I0319 14:03:13.423041 543705 net.go:770] primary dev: ETH0
I0319 14:03:13.423054 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:03:13.423066 543705 net.go:698] Add success.
I0319 14:03:13.469405 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fef43cf8-4feb-4ec7-a269-03e9ef9d8e2b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:03:13.469436 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:03:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:03:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:03:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 14:03:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:03:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 14:03:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:03:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:03:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:03:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:03:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:03:16.472498 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:03:19.488130 543705 disk_info.go:125] begin check local disk info of client
I0319 14:03:19.490559 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:03:19.490565 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329a00 0xc000329d80]
E0319 14:03:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:23.409778 543705 memory.go:184] no items to output this cycle
I0319 14:03:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:03:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:33.409804 543705 memory.go:184] no items to output this cycle
I0319 14:03:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 14:03:37.729733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:03:37.729740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:03:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:43.411158 543705 memory.go:191] Add success.
I0319 14:03:43.409826 543705 cpu.go:282] Add success.
I0319 14:03:43.419830 543705 net.go:648] Add success.
I0319 14:03:43.422742 543705 net.go:770] primary dev: ETH0
I0319 14:03:43.422756 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:03:43.422770 543705 net.go:698] Add success.
I0319 14:03:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:03:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:03:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:03:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:03:53.409767 543705 memory.go:184] no items to output this cycle
I0319 14:03:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:04:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:03.409805 543705 memory.go:184] no items to output this cycle
I0319 14:04:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 14:04:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:13.409799 543705 memory.go:191] Add success.
W0319 14:04:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:04:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:04:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:04:13.409843 543705 cpu.go:282] Add success.
I0319 14:04:13.420389 543705 net.go:648] Add success.
I0319 14:04:13.423139 543705 net.go:770] primary dev: ETH0
I0319 14:04:13.423155 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:04:13.423169 543705 net.go:698] Add success.
I0319 14:04:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:04:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:04:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 14:04:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:04:14.456588 543705 disk_worker.go:494] system disk:vda1
I0319 14:04:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:04:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:04:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:04:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:04:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:04:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:04:19.491138 543705 disk_info.go:125] begin check local disk info of client
I0319 14:04:19.493559 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:04:19.493566 543705 disk_info.go:196] parse disk info done, disk is : [0xc00052be40 0xc00052be80]
E0319 14:04:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:23.409790 543705 memory.go:184] no items to output this cycle
I0319 14:04:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:04:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:33.409814 543705 memory.go:184] no items to output this cycle
I0319 14:04:33.409826 543705 cpu.go:275] no items to output this cycle
E0319 14:04:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:43.409785 543705 memory.go:191] Add success.
I0319 14:04:43.409819 543705 cpu.go:282] Add success.
I0319 14:04:43.419912 543705 net.go:648] Add success.
I0319 14:04:43.422806 543705 net.go:770] primary dev: ETH0
I0319 14:04:43.422821 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:04:43.422836 543705 net.go:698] Add success.
I0319 14:04:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:04:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:04:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:04:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:04:53.409838 543705 memory.go:184] no items to output this cycle
I0319 14:04:53.409938 543705 cpu.go:275] no items to output this cycle
E0319 14:05:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:03.409789 543705 memory.go:184] no items to output this cycle
I0319 14:05:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:05:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:13.409806 543705 memory.go:191] Add success.
I0319 14:05:13.409808 543705 cpu.go:282] Add success.
W0319 14:05:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:05:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:05:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:05:13.420155 543705 net.go:648] Add success.
I0319 14:05:13.422821 543705 net.go:770] primary dev: ETH0
I0319 14:05:13.422836 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:05:13.422847 543705 net.go:698] Add success.
I0319 14:05:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:05:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:05:14.455233 543705 disk_worker.go:708] disk space is not compliant
W0319 14:05:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:05:14.456638 543705 disk_worker.go:494] system disk:vda1
I0319 14:05:14.456668 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:05:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:05:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:05:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:05:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:05:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:05:19.494160 543705 disk_info.go:125] begin check local disk info of client
I0319 14:05:19.496538 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:05:19.496544 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6e80 0xc0003b6ec0]
E0319 14:05:23.410217 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:23.410235 543705 memory.go:184] no items to output this cycle
I0319 14:05:23.410257 543705 cpu.go:275] no items to output this cycle
E0319 14:05:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:33.409810 543705 memory.go:184] no items to output this cycle
I0319 14:05:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 14:05:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:43.409821 543705 memory.go:191] Add success.
I0319 14:05:43.409825 543705 cpu.go:282] Add success.
I0319 14:05:43.420066 543705 net.go:648] Add success.
I0319 14:05:43.423008 543705 net.go:770] primary dev: ETH0
I0319 14:05:43.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:05:43.423035 543705 net.go:698] Add success.
I0319 14:05:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:05:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:05:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:05:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:05:53.409803 543705 memory.go:184] no items to output this cycle
I0319 14:05:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 14:06:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:03.409773 543705 memory.go:184] no items to output this cycle
I0319 14:06:03.409778 543705 cpu.go:275] no items to output this cycle
E0319 14:06:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:13.409801 543705 memory.go:191] Add success.
I0319 14:06:13.409823 543705 cpu.go:282] Add success.
W0319 14:06:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:06:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:06:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:06:13.420117 543705 net.go:648] Add success.
I0319 14:06:13.422869 543705 net.go:770] primary dev: ETH0
I0319 14:06:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:06:13.422893 543705 net.go:698] Add success.
I0319 14:06:13.468512 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e53133b4-5a6b-4ea1-a542-c418ee8bd446","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:06:13.468546 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:06:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:06:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:06:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0319 14:06:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:06:14.456537 543705 disk_worker.go:494] system disk:vda1
I0319 14:06:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:06:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:06:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:06:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:06:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:06:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:06:19.497172 543705 disk_info.go:125] begin check local disk info of client
I0319 14:06:19.499556 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:06:19.499562 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6880 0xc0003b68c0]
E0319 14:06:23.410249 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:23.410265 543705 memory.go:184] no items to output this cycle
I0319 14:06:23.410298 543705 cpu.go:275] no items to output this cycle
E0319 14:06:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:33.409766 543705 memory.go:184] no items to output this cycle
I0319 14:06:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 14:06:37.731367 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:06:37.731375 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:06:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:43.410766 543705 memory.go:191] Add success.
I0319 14:06:43.409802 543705 cpu.go:282] Add success.
I0319 14:06:43.420482 543705 net.go:648] Add success.
I0319 14:06:43.423544 543705 net.go:770] primary dev: ETH0
I0319 14:06:43.423560 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:06:43.423573 543705 net.go:698] Add success.
I0319 14:06:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:06:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:06:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:06:53.410365 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:06:53.410381 543705 memory.go:184] no items to output this cycle
I0319 14:06:53.410395 543705 cpu.go:275] no items to output this cycle
E0319 14:07:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:03.409791 543705 cpu.go:275] no items to output this cycle
I0319 14:07:03.409793 543705 memory.go:184] no items to output this cycle
E0319 14:07:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:13.409777 543705 memory.go:191] Add success.
W0319 14:07:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:07:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:07:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:07:13.409821 543705 cpu.go:282] Add success.
I0319 14:07:13.420254 543705 net.go:648] Add success.
I0319 14:07:13.423418 543705 net.go:770] primary dev: ETH0
I0319 14:07:13.423433 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:07:13.423447 543705 net.go:698] Add success.
I0319 14:07:13.452971 543705 event_worker.go:152] Polling the log file for events...
W0319 14:07:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:07:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 14:07:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:07:14.456875 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:07:14.456884 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:07:14.456890 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:07:14.456974 543705 disk_worker.go:494] system disk:vda1
I0319 14:07:14.457015 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:07:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:07:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:07:16.457941 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:07:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:07:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:07:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:07:16.472349 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:07:19.499984 543705 disk_info.go:125] begin check local disk info of client
I0319 14:07:19.502427 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:07:19.502433 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7e80 0xc0003b7ec0]
E0319 14:07:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:23.409771 543705 memory.go:184] no items to output this cycle
I0319 14:07:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:07:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:33.409773 543705 memory.go:184] no items to output this cycle
I0319 14:07:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:07:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:43.409805 543705 memory.go:191] Add success.
I0319 14:07:43.409830 543705 cpu.go:282] Add success.
I0319 14:07:43.420014 543705 net.go:648] Add success.
I0319 14:07:43.422724 543705 net.go:770] primary dev: ETH0
I0319 14:07:43.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:07:43.422750 543705 net.go:698] Add success.
I0319 14:07:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:07:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:07:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:07:53.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:07:53.409893 543705 cpu.go:275] no items to output this cycle
I0319 14:07:53.409902 543705 memory.go:184] no items to output this cycle
E0319 14:08:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:03.409803 543705 memory.go:184] no items to output this cycle
I0319 14:08:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 14:08:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:13.409793 543705 memory.go:191] Add success.
I0319 14:08:13.409794 543705 cpu.go:282] Add success.
W0319 14:08:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:08:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:08:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:08:13.420148 543705 net.go:648] Add success.
I0319 14:08:13.422782 543705 net.go:770] primary dev: ETH0
I0319 14:08:13.422797 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:08:13.422811 543705 net.go:698] Add success.
I0319 14:08:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:08:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:08:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 14:08:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:08:14.456610 543705 disk_worker.go:494] system disk:vda1
I0319 14:08:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:08:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:08:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:08:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:08:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:08:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:08:19.503206 543705 disk_info.go:125] begin check local disk info of client
I0319 14:08:19.505577 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:08:19.505584 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328e40 0xc000328e80]
E0319 14:08:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:23.409806 543705 memory.go:184] no items to output this cycle
I0319 14:08:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 14:08:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:33.409779 543705 memory.go:184] no items to output this cycle
I0319 14:08:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 14:08:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:43.409798 543705 memory.go:191] Add success.
I0319 14:08:43.409801 543705 cpu.go:282] Add success.
I0319 14:08:43.419927 543705 net.go:648] Add success.
I0319 14:08:43.422345 543705 net.go:770] primary dev: ETH0
I0319 14:08:43.422360 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:08:43.422376 543705 net.go:698] Add success.
I0319 14:08:46.457666 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:08:46.457729 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:08:46.457756 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:08:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:08:53.409765 543705 memory.go:184] no items to output this cycle
I0319 14:08:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 14:09:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:03.409786 543705 memory.go:184] no items to output this cycle
I0319 14:09:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 14:09:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:13.409789 543705 memory.go:191] Add success.
W0319 14:09:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:09:13.409820 543705 cpu.go:282] Add success.
W0319 14:09:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:09:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:09:13.420161 543705 net.go:648] Add success.
I0319 14:09:13.423234 543705 net.go:770] primary dev: ETH0
I0319 14:09:13.423247 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:09:13.423260 543705 net.go:698] Add success.
I0319 14:09:14.392766 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7c4e04c2-9b20-475f-9b13-759e43f45954","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:09:14.392801 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:09:14.454143 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:09:14.454331 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:09:14.454341 543705 disk_worker.go:708] disk space is not compliant
W0319 14:09:14.454343 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:09:14.455683 543705 disk_worker.go:494] system disk:vda1
I0319 14:09:14.455726 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:09:15.455609 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:09:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:09:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:09:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:09:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:09:19.506223 543705 disk_info.go:125] begin check local disk info of client
I0319 14:09:19.508670 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:09:19.508678 543705 disk_info.go:196] parse disk info done, disk is : [0xc000356540 0xc000356580]
E0319 14:09:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:23.409802 543705 memory.go:184] no items to output this cycle
I0319 14:09:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 14:09:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:33.409774 543705 memory.go:184] no items to output this cycle
I0319 14:09:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 14:09:37.732348 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:09:37.732355 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:09:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:43.410557 543705 memory.go:191] Add success.
I0319 14:09:43.409801 543705 cpu.go:282] Add success.
I0319 14:09:43.420287 543705 net.go:648] Add success.
I0319 14:09:43.422959 543705 net.go:770] primary dev: ETH0
I0319 14:09:43.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:09:43.422985 543705 net.go:698] Add success.
I0319 14:09:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:09:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:09:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:09:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:09:53.409777 543705 memory.go:184] no items to output this cycle
I0319 14:09:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 14:10:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:03.409777 543705 memory.go:184] no items to output this cycle
I0319 14:10:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:10:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:13.409812 543705 memory.go:191] Add success.
I0319 14:10:13.409815 543705 cpu.go:282] Add success.
W0319 14:10:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:10:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:10:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:10:13.420075 543705 net.go:648] Add success.
I0319 14:10:13.423144 543705 net.go:770] primary dev: ETH0
I0319 14:10:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:10:13.423171 543705 net.go:698] Add success.
I0319 14:10:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:10:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:10:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 14:10:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:10:14.456520 543705 disk_worker.go:494] system disk:vda1
I0319 14:10:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:10:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:10:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:10:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:10:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:10:16.472445 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:10:19.508762 543705 disk_info.go:125] begin check local disk info of client
I0319 14:10:19.511227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:10:19.511234 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7340 0xc0003b7380]
E0319 14:10:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:23.409767 543705 memory.go:184] no items to output this cycle
I0319 14:10:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 14:10:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:33.409787 543705 memory.go:184] no items to output this cycle
I0319 14:10:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 14:10:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:43.409802 543705 memory.go:191] Add success.
I0319 14:10:43.409803 543705 cpu.go:282] Add success.
I0319 14:10:43.420058 543705 net.go:648] Add success.
I0319 14:10:43.422951 543705 net.go:770] primary dev: ETH0
I0319 14:10:43.422963 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:10:43.422977 543705 net.go:698] Add success.
I0319 14:10:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:10:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:10:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:10:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:10:53.409777 543705 memory.go:184] no items to output this cycle
I0319 14:10:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:11:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:03.409810 543705 memory.go:184] no items to output this cycle
I0319 14:11:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 14:11:13.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:13.409850 543705 memory.go:191] Add success.
I0319 14:11:13.409853 543705 cpu.go:282] Add success.
W0319 14:11:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:11:13.409898 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:11:13.409902 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:11:13.420241 543705 net.go:648] Add success.
I0319 14:11:13.423062 543705 net.go:770] primary dev: ETH0
I0319 14:11:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:11:13.423091 543705 net.go:698] Add success.
I0319 14:11:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:11:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:11:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 14:11:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:11:14.456498 543705 disk_worker.go:494] system disk:vda1
I0319 14:11:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:11:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:11:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:11:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:11:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:11:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:11:19.512257 543705 disk_info.go:125] begin check local disk info of client
I0319 14:11:19.514715 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:11:19.514722 543705 disk_info.go:196] parse disk info done, disk is : [0xc000356ac0 0xc000356b00]
E0319 14:11:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:23.409797 543705 memory.go:184] no items to output this cycle
I0319 14:11:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 14:11:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:33.409766 543705 memory.go:184] no items to output this cycle
I0319 14:11:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 14:11:43.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:43.409847 543705 memory.go:191] Add success.
I0319 14:11:43.409847 543705 cpu.go:282] Add success.
I0319 14:11:43.419982 543705 net.go:648] Add success.
I0319 14:11:43.422947 543705 net.go:770] primary dev: ETH0
I0319 14:11:43.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:11:43.422975 543705 net.go:698] Add success.
I0319 14:11:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:11:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:11:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:11:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:11:53.409783 543705 memory.go:184] no items to output this cycle
I0319 14:11:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 14:12:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:03.409804 543705 memory.go:184] no items to output this cycle
I0319 14:12:03.409834 543705 cpu.go:275] no items to output this cycle
E0319 14:12:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:13.409787 543705 memory.go:191] Add success.
W0319 14:12:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:12:13.409813 543705 cpu.go:282] Add success.
W0319 14:12:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:12:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:12:13.419717 543705 net.go:648] Add success.
I0319 14:12:13.422849 543705 net.go:770] primary dev: ETH0
I0319 14:12:13.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:12:13.422878 543705 net.go:698] Add success.
I0319 14:12:13.527713 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"84534fd2-1122-4215-8895-696e4549e61d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:12:13.527744 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 14:12:14.455227 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:12:14.455242 543705 disk_worker.go:708] disk space is not compliant
W0319 14:12:14.455246 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:12:14.455923 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:12:14.455932 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:12:14.455937 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:12:14.456855 543705 disk_worker.go:494] system disk:vda1
I0319 14:12:14.456884 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:12:15.456778 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:12:15.456787 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:12:16.458013 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:12:16.458013 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:12:16.458072 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:12:16.458093 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:12:16.472448 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:12:19.514805 543705 disk_info.go:125] begin check local disk info of client
I0319 14:12:19.517325 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:12:19.517333 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004dce40 0xc0004dce80]
E0319 14:12:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:23.409767 543705 memory.go:184] no items to output this cycle
I0319 14:12:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:12:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:33.409798 543705 memory.go:184] no items to output this cycle
I0319 14:12:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 14:12:37.733361 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:12:37.733369 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:12:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:43.410584 543705 memory.go:191] Add success.
I0319 14:12:43.409823 543705 cpu.go:282] Add success.
I0319 14:12:43.420346 543705 net.go:648] Add success.
I0319 14:12:43.422859 543705 net.go:770] primary dev: ETH0
I0319 14:12:43.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:12:43.422898 543705 net.go:698] Add success.
I0319 14:12:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:12:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:12:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:12:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:12:53.409776 543705 memory.go:184] no items to output this cycle
I0319 14:12:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 14:13:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:03.409776 543705 memory.go:184] no items to output this cycle
I0319 14:13:03.409778 543705 cpu.go:275] no items to output this cycle
E0319 14:13:13.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:13.409918 543705 memory.go:191] Add success.
W0319 14:13:13.409960 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:13:13.409979 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:13:13.409982 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:13:13.410026 543705 cpu.go:282] Add success.
I0319 14:13:13.419738 543705 net.go:648] Add success.
I0319 14:13:13.422719 543705 net.go:770] primary dev: ETH0
I0319 14:13:13.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:13:13.422747 543705 net.go:698] Add success.
I0319 14:13:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:13:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:13:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 14:13:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:13:14.456585 543705 disk_worker.go:494] system disk:vda1
I0319 14:13:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:13:15.456026 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:13:16.458054 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:13:16.458121 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:13:16.458150 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:13:16.472547 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:13:19.518297 543705 disk_info.go:125] begin check local disk info of client
I0319 14:13:19.520756 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:13:19.520762 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a1680 0xc0002a16c0]
E0319 14:13:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:23.409796 543705 memory.go:184] no items to output this cycle
I0319 14:13:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 14:13:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:33.409781 543705 memory.go:184] no items to output this cycle
I0319 14:13:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 14:13:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:43.409825 543705 memory.go:191] Add success.
I0319 14:13:43.409836 543705 cpu.go:282] Add success.
I0319 14:13:43.420134 543705 net.go:648] Add success.
I0319 14:13:43.423028 543705 net.go:770] primary dev: ETH0
I0319 14:13:43.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:13:43.423055 543705 net.go:698] Add success.
I0319 14:13:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:13:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:13:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:13:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:13:53.409775 543705 memory.go:184] no items to output this cycle
I0319 14:13:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 14:14:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:03.409762 543705 memory.go:184] no items to output this cycle
I0319 14:14:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 14:14:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:13.409913 543705 cpu.go:282] Add success.
I0319 14:14:13.409917 543705 memory.go:191] Add success.
W0319 14:14:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:14:13.409978 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:14:13.409981 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:14:13.419753 543705 net.go:648] Add success.
I0319 14:14:13.422916 543705 net.go:770] primary dev: ETH0
I0319 14:14:13.422929 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:14:13.422942 543705 net.go:698] Add success.
I0319 14:14:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:14:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:14:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 14:14:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:14:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 14:14:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:14:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:14:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:14:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:14:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:14:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:14:19.521302 543705 disk_info.go:125] begin check local disk info of client
I0319 14:14:19.523793 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:14:19.523799 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7780 0xc0003b77c0]
E0319 14:14:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:23.409769 543705 memory.go:184] no items to output this cycle
I0319 14:14:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 14:14:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:33.409761 543705 memory.go:184] no items to output this cycle
I0319 14:14:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:14:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:43.409796 543705 memory.go:191] Add success.
I0319 14:14:43.409797 543705 cpu.go:282] Add success.
I0319 14:14:43.419865 543705 net.go:648] Add success.
I0319 14:14:43.422534 543705 net.go:770] primary dev: ETH0
I0319 14:14:43.422549 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:14:43.422560 543705 net.go:698] Add success.
I0319 14:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:14:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:14:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:14:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:14:53.409778 543705 memory.go:184] no items to output this cycle
I0319 14:14:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 14:15:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:03.409799 543705 memory.go:184] no items to output this cycle
I0319 14:15:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:15:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:13.409787 543705 memory.go:191] Add success.
W0319 14:15:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:15:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:15:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:15:13.409837 543705 cpu.go:282] Add success.
I0319 14:15:13.420152 543705 net.go:770] primary dev: ETH0
I0319 14:15:13.420166 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:15:13.420180 543705 net.go:698] Add success.
I0319 14:15:13.420498 543705 net.go:648] Add success.
I0319 14:15:13.469232 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a6b89d40-b7ec-4288-be59-fa7ba1bbf83c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:15:13.469263 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:15:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:15:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:15:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 14:15:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:15:14.456538 543705 disk_worker.go:494] system disk:vda1
I0319 14:15:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:15:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:15:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:15:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:15:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:15:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:15:19.524262 543705 disk_info.go:125] begin check local disk info of client
I0319 14:15:19.526669 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:15:19.526676 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039d4c0 0xc00039d500]
E0319 14:15:23.410212 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:23.410228 543705 memory.go:184] no items to output this cycle
I0319 14:15:23.410237 543705 cpu.go:275] no items to output this cycle
E0319 14:15:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:33.409777 543705 memory.go:184] no items to output this cycle
I0319 14:15:33.409778 543705 cpu.go:275] no items to output this cycle
I0319 14:15:37.733733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:15:37.733739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:15:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:43.410594 543705 memory.go:191] Add success.
I0319 14:15:43.409814 543705 cpu.go:282] Add success.
I0319 14:15:43.420301 543705 net.go:648] Add success.
I0319 14:15:43.423271 543705 net.go:770] primary dev: ETH0
I0319 14:15:43.423284 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:15:43.423296 543705 net.go:698] Add success.
I0319 14:15:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:15:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:15:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:15:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:15:53.409785 543705 memory.go:184] no items to output this cycle
I0319 14:15:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 14:16:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:03.409773 543705 memory.go:184] no items to output this cycle
I0319 14:16:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:16:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:13.409797 543705 memory.go:191] Add success.
I0319 14:16:13.409798 543705 cpu.go:282] Add success.
W0319 14:16:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:16:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:16:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:16:13.419727 543705 net.go:648] Add success.
I0319 14:16:13.422314 543705 net.go:770] primary dev: ETH0
I0319 14:16:13.422327 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:16:13.422339 543705 net.go:698] Add success.
I0319 14:16:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:16:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:16:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 14:16:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:16:14.456497 543705 disk_worker.go:494] system disk:vda1
I0319 14:16:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:16:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:16:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:16:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:16:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:16:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:16:19.527376 543705 disk_info.go:125] begin check local disk info of client
I0319 14:16:19.529772 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:16:19.529778 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa400 0xc0001aa480]
E0319 14:16:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:23.409799 543705 memory.go:184] no items to output this cycle
I0319 14:16:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:16:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:33.409771 543705 memory.go:184] no items to output this cycle
I0319 14:16:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 14:16:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:43.409826 543705 memory.go:191] Add success.
I0319 14:16:43.409832 543705 cpu.go:282] Add success.
I0319 14:16:43.419895 543705 net.go:648] Add success.
I0319 14:16:43.422850 543705 net.go:770] primary dev: ETH0
I0319 14:16:43.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:16:43.422880 543705 net.go:698] Add success.
I0319 14:16:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:16:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:16:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:16:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:16:53.409794 543705 memory.go:184] no items to output this cycle
I0319 14:16:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:17:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:03.409773 543705 memory.go:184] no items to output this cycle
I0319 14:17:03.409782 543705 cpu.go:275] no items to output this cycle
W0319 14:17:13.409712 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:17:13.409734 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:17:13.409741 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:17:13.409829 543705 cpu.go:282] Add success.
E0319 14:17:13.409836 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:13.409852 543705 memory.go:191] Add success.
I0319 14:17:13.420378 543705 net.go:648] Add success.
I0319 14:17:13.423362 543705 net.go:770] primary dev: ETH0
I0319 14:17:13.423375 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:17:13.423386 543705 net.go:698] Add success.
I0319 14:17:13.452785 543705 event_worker.go:152] Polling the log file for events...
W0319 14:17:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:17:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 14:17:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:17:14.456767 543705 disk_worker.go:494] system disk:vda1
I0319 14:17:14.456807 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:17:14.456979 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:17:14.456988 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:17:14.456993 543705 custom_config.go:64] query custom config with name: gpu
E0319 14:17:15.456773 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:17:15.456781 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:17:16.457871 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:17:16.457871 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:17:16.457923 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:17:16.457943 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:17:16.472255 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:17:19.530335 543705 disk_info.go:125] begin check local disk info of client
I0319 14:17:19.532560 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:17:19.532566 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394c00 0xc000394c40]
E0319 14:17:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:23.409763 543705 memory.go:184] no items to output this cycle
I0319 14:17:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 14:17:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:33.409795 543705 memory.go:184] no items to output this cycle
I0319 14:17:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:17:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:43.409778 543705 memory.go:191] Add success.
I0319 14:17:43.409802 543705 cpu.go:282] Add success.
I0319 14:17:43.419896 543705 net.go:648] Add success.
I0319 14:17:43.422563 543705 net.go:770] primary dev: ETH0
I0319 14:17:43.422578 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:17:43.422592 543705 net.go:698] Add success.
I0319 14:17:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:17:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:17:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:17:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:17:53.409769 543705 memory.go:184] no items to output this cycle
I0319 14:17:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:18:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:03.409798 543705 memory.go:184] no items to output this cycle
I0319 14:18:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:18:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:13.409785 543705 memory.go:191] Add success.
I0319 14:18:13.409806 543705 cpu.go:282] Add success.
W0319 14:18:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:18:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:18:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:18:13.420241 543705 net.go:648] Add success.
I0319 14:18:13.423231 543705 net.go:770] primary dev: ETH0
I0319 14:18:13.423244 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:18:13.423255 543705 net.go:698] Add success.
I0319 14:18:13.468484 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9161e4d0-68d4-4c5e-8bf4-670f3ccb13bc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:18:13.468516 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:18:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:18:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:18:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 14:18:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:18:14.456517 543705 disk_worker.go:494] system disk:vda1
I0319 14:18:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:18:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:18:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:18:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:18:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:18:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:18:19.533411 543705 disk_info.go:125] begin check local disk info of client
I0319 14:18:19.535902 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:18:19.535909 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4b00 0xc0000c4b40]
E0319 14:18:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:23.409780 543705 cpu.go:275] no items to output this cycle
I0319 14:18:23.409785 543705 memory.go:184] no items to output this cycle
E0319 14:18:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:33.409780 543705 memory.go:184] no items to output this cycle
I0319 14:18:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 14:18:37.733891 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:18:37.733899 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:18:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:43.410603 543705 memory.go:191] Add success.
I0319 14:18:43.409804 543705 cpu.go:282] Add success.
I0319 14:18:43.420335 543705 net.go:648] Add success.
I0319 14:18:43.422984 543705 net.go:770] primary dev: ETH0
I0319 14:18:43.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:18:43.423010 543705 net.go:698] Add success.
I0319 14:18:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:18:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:18:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:18:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:18:53.409795 543705 memory.go:184] no items to output this cycle
I0319 14:18:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 14:19:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:03.409795 543705 memory.go:184] no items to output this cycle
I0319 14:19:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:19:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:13.409783 543705 memory.go:191] Add success.
W0319 14:19:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:19:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:19:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:19:13.409827 543705 cpu.go:282] Add success.
I0319 14:19:13.420426 543705 net.go:648] Add success.
I0319 14:19:13.423279 543705 net.go:770] primary dev: ETH0
I0319 14:19:13.423293 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:19:13.423304 543705 net.go:698] Add success.
I0319 14:19:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:19:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:19:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 14:19:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:19:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 14:19:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:19:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:19:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:19:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:19:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:19:16.472351 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:19:19.535992 543705 disk_info.go:125] begin check local disk info of client
I0319 14:19:19.538398 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:19:19.538404 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056b380 0xc00056b3c0]
E0319 14:19:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:23.409790 543705 memory.go:184] no items to output this cycle
I0319 14:19:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:19:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:33.409780 543705 memory.go:184] no items to output this cycle
I0319 14:19:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:19:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:43.409822 543705 memory.go:191] Add success.
I0319 14:19:43.409829 543705 cpu.go:282] Add success.
I0319 14:19:43.419965 543705 net.go:648] Add success.
I0319 14:19:43.422991 543705 net.go:770] primary dev: ETH0
I0319 14:19:43.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:19:43.423018 543705 net.go:698] Add success.
I0319 14:19:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:19:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:19:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:19:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:19:53.409776 543705 memory.go:184] no items to output this cycle
I0319 14:19:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:20:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:03.409793 543705 memory.go:184] no items to output this cycle
I0319 14:20:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:20:13.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:13.409911 543705 cpu.go:282] Add success.
I0319 14:20:13.409920 543705 memory.go:191] Add success.
W0319 14:20:13.409954 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:20:13.409992 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:20:13.409998 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:20:13.419748 543705 net.go:648] Add success.
I0319 14:20:13.422693 543705 net.go:770] primary dev: ETH0
I0319 14:20:13.422708 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:20:13.422721 543705 net.go:698] Add success.
I0319 14:20:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:20:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:20:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 14:20:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:20:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 14:20:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:20:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:20:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:20:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:20:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:20:16.472439 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:20:19.539382 543705 disk_info.go:125] begin check local disk info of client
I0319 14:20:19.541799 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:20:19.541805 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056ab00 0xc00056ab40]
E0319 14:20:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:23.409797 543705 memory.go:184] no items to output this cycle
I0319 14:20:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 14:20:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:33.409799 543705 memory.go:184] no items to output this cycle
I0319 14:20:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:20:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:43.409794 543705 memory.go:191] Add success.
I0319 14:20:43.409815 543705 cpu.go:282] Add success.
I0319 14:20:43.419939 543705 net.go:648] Add success.
I0319 14:20:43.422653 543705 net.go:770] primary dev: ETH0
I0319 14:20:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:20:43.422677 543705 net.go:698] Add success.
I0319 14:20:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:20:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:20:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:20:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:20:53.409804 543705 memory.go:184] no items to output this cycle
I0319 14:20:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:21:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:03.409799 543705 memory.go:184] no items to output this cycle
I0319 14:21:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:21:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:13.409810 543705 memory.go:191] Add success.
I0319 14:21:13.409811 543705 cpu.go:282] Add success.
W0319 14:21:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:21:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:21:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:21:13.420222 543705 net.go:648] Add success.
I0319 14:21:13.422832 543705 net.go:770] primary dev: ETH0
I0319 14:21:13.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:21:13.422858 543705 net.go:698] Add success.
I0319 14:21:13.463316 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"40b64f46-2995-4dff-9894-829b4c7be31b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:21:13.463350 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:21:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:21:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:21:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 14:21:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:21:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 14:21:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:21:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:21:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:21:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:21:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:21:19.542353 543705 disk_info.go:125] begin check local disk info of client
I0319 14:21:19.544741 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:21:19.544748 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6c40 0xc0003b6c80]
E0319 14:21:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:23.409793 543705 memory.go:184] no items to output this cycle
I0319 14:21:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:21:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:33.409797 543705 memory.go:184] no items to output this cycle
I0319 14:21:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 14:21:37.734046 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:21:37.734053 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:21:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:43.410595 543705 memory.go:191] Add success.
I0319 14:21:43.409807 543705 cpu.go:282] Add success.
I0319 14:21:43.420288 543705 net.go:648] Add success.
I0319 14:21:43.422873 543705 net.go:770] primary dev: ETH0
I0319 14:21:43.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:21:43.422900 543705 net.go:698] Add success.
I0319 14:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:21:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:21:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:21:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:21:53.409779 543705 memory.go:184] no items to output this cycle
I0319 14:21:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 14:22:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:03.409766 543705 memory.go:184] no items to output this cycle
I0319 14:22:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 14:22:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:13.409791 543705 memory.go:191] Add success.
W0319 14:22:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:22:13.409825 543705 cpu.go:282] Add success.
W0319 14:22:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:22:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:22:13.420123 543705 net.go:648] Add success.
I0319 14:22:13.422791 543705 net.go:770] primary dev: ETH0
I0319 14:22:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:22:13.422817 543705 net.go:698] Add success.
W0319 14:22:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:22:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 14:22:14.455198 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:22:14.456808 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:22:14.456817 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:22:14.456823 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:22:14.456866 543705 disk_worker.go:494] system disk:vda1
I0319 14:22:14.456908 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:22:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:22:15.456852 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:22:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:22:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:22:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:22:16.458021 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:22:16.472352 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:22:19.545422 543705 disk_info.go:125] begin check local disk info of client
I0319 14:22:19.547781 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:22:19.547787 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab600 0xc0001ab640]
E0319 14:22:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:23.409794 543705 memory.go:184] no items to output this cycle
I0319 14:22:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:22:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:33.409773 543705 memory.go:184] no items to output this cycle
I0319 14:22:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 14:22:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:43.409800 543705 memory.go:191] Add success.
I0319 14:22:43.409802 543705 cpu.go:282] Add success.
I0319 14:22:43.419989 543705 net.go:648] Add success.
I0319 14:22:43.422647 543705 net.go:770] primary dev: ETH0
I0319 14:22:43.422662 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:22:43.422677 543705 net.go:698] Add success.
I0319 14:22:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:22:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:22:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:22:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:22:53.409788 543705 cpu.go:275] no items to output this cycle
I0319 14:22:53.409791 543705 memory.go:184] no items to output this cycle
E0319 14:23:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:03.409771 543705 memory.go:184] no items to output this cycle
I0319 14:23:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:23:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:13.409818 543705 memory.go:191] Add success.
I0319 14:23:13.409821 543705 cpu.go:282] Add success.
W0319 14:23:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:23:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:23:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:23:13.420295 543705 net.go:648] Add success.
I0319 14:23:13.423226 543705 net.go:770] primary dev: ETH0
I0319 14:23:13.423238 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:23:13.423264 543705 net.go:698] Add success.
I0319 14:23:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:23:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:23:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 14:23:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:23:14.456528 543705 disk_worker.go:494] system disk:vda1
I0319 14:23:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:23:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:23:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:23:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:23:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:23:16.472451 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:23:19.548379 543705 disk_info.go:125] begin check local disk info of client
I0319 14:23:19.550804 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:23:19.550810 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5200 0xc0000c5240]
E0319 14:23:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:23.409779 543705 cpu.go:275] no items to output this cycle
I0319 14:23:23.409781 543705 memory.go:184] no items to output this cycle
E0319 14:23:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:33.409798 543705 memory.go:184] no items to output this cycle
I0319 14:23:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:23:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:43.409795 543705 memory.go:191] Add success.
I0319 14:23:43.409811 543705 cpu.go:282] Add success.
I0319 14:23:43.420276 543705 net.go:648] Add success.
I0319 14:23:43.423056 543705 net.go:770] primary dev: ETH0
I0319 14:23:43.423069 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:23:43.423080 543705 net.go:698] Add success.
I0319 14:23:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:23:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:23:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:23:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:23:53.409796 543705 memory.go:184] no items to output this cycle
I0319 14:23:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 14:24:03.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:03.409858 543705 memory.go:184] no items to output this cycle
I0319 14:24:03.409924 543705 cpu.go:275] no items to output this cycle
E0319 14:24:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:13.409804 543705 memory.go:191] Add success.
I0319 14:24:13.409807 543705 cpu.go:282] Add success.
W0319 14:24:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:24:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:24:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:24:13.420180 543705 net.go:648] Add success.
I0319 14:24:13.423015 543705 net.go:770] primary dev: ETH0
I0319 14:24:13.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:24:13.423041 543705 net.go:698] Add success.
I0319 14:24:13.563774 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"49b38030-5282-4d61-9738-4cc3aa7ec8e2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:24:13.563807 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:24:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:24:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:24:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 14:24:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:24:14.456612 543705 disk_worker.go:494] system disk:vda1
I0319 14:24:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:24:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:24:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:24:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:24:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:24:19.551388 543705 disk_info.go:125] begin check local disk info of client
I0319 14:24:19.553771 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:24:19.553777 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057d8c0 0xc00057d900]
E0319 14:24:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:23.409789 543705 memory.go:184] no items to output this cycle
I0319 14:24:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:24:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:33.409765 543705 memory.go:184] no items to output this cycle
I0319 14:24:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 14:24:37.735378 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:24:37.735398 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:24:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:43.410672 543705 memory.go:191] Add success.
I0319 14:24:43.409801 543705 cpu.go:282] Add success.
I0319 14:24:43.420442 543705 net.go:648] Add success.
I0319 14:24:43.423263 543705 net.go:770] primary dev: ETH0
I0319 14:24:43.423277 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:24:43.423293 543705 net.go:698] Add success.
I0319 14:24:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:24:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:24:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:24:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:24:53.409778 543705 memory.go:184] no items to output this cycle
I0319 14:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 14:25:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:03.409770 543705 memory.go:184] no items to output this cycle
I0319 14:25:03.409895 543705 cpu.go:275] no items to output this cycle
E0319 14:25:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:13.409830 543705 memory.go:191] Add success.
I0319 14:25:13.409832 543705 cpu.go:282] Add success.
W0319 14:25:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:25:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:25:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:25:13.420234 543705 net.go:648] Add success.
I0319 14:25:13.423546 543705 net.go:770] primary dev: ETH0
I0319 14:25:13.423575 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:25:13.423589 543705 net.go:698] Add success.
I0319 14:25:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:25:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:25:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 14:25:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:25:14.456594 543705 disk_worker.go:494] system disk:vda1
I0319 14:25:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:25:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:25:16.458003 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:25:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:25:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:25:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:25:19.554470 543705 disk_info.go:125] begin check local disk info of client
I0319 14:25:19.556853 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:25:19.556859 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057cd00 0xc00057cd40]
E0319 14:25:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:23.409795 543705 memory.go:184] no items to output this cycle
I0319 14:25:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 14:25:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:33.409780 543705 memory.go:184] no items to output this cycle
I0319 14:25:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:25:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:43.409816 543705 memory.go:191] Add success.
I0319 14:25:43.409820 543705 cpu.go:282] Add success.
I0319 14:25:43.419871 543705 net.go:648] Add success.
I0319 14:25:43.422746 543705 net.go:770] primary dev: ETH0
I0319 14:25:43.422761 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:25:43.422773 543705 net.go:698] Add success.
I0319 14:25:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:25:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:25:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:25:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:25:53.409778 543705 memory.go:184] no items to output this cycle
I0319 14:25:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:26:03.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:03.409914 543705 memory.go:184] no items to output this cycle
I0319 14:26:03.409918 543705 cpu.go:275] no items to output this cycle
E0319 14:26:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:13.409795 543705 memory.go:191] Add success.
W0319 14:26:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:26:13.409831 543705 cpu.go:282] Add success.
W0319 14:26:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:26:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:26:13.420163 543705 net.go:648] Add success.
I0319 14:26:13.423249 543705 net.go:770] primary dev: ETH0
I0319 14:26:13.423264 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:26:13.423278 543705 net.go:698] Add success.
I0319 14:26:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:26:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:26:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 14:26:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:26:14.456596 543705 disk_worker.go:494] system disk:vda1
I0319 14:26:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:26:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:26:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:26:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:26:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:26:16.472440 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:26:19.557428 543705 disk_info.go:125] begin check local disk info of client
I0319 14:26:19.559847 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:26:19.559853 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7580 0xc0003b75c0]
E0319 14:26:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:23.409792 543705 memory.go:184] no items to output this cycle
I0319 14:26:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:26:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:33.409772 543705 memory.go:184] no items to output this cycle
I0319 14:26:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:26:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:43.409826 543705 memory.go:191] Add success.
I0319 14:26:43.409833 543705 cpu.go:282] Add success.
I0319 14:26:43.420013 543705 net.go:648] Add success.
I0319 14:26:43.422900 543705 net.go:770] primary dev: ETH0
I0319 14:26:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:26:43.422928 543705 net.go:698] Add success.
I0319 14:26:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:26:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:26:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:26:53.410237 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:26:53.410261 543705 memory.go:184] no items to output this cycle
I0319 14:26:53.410279 543705 cpu.go:275] no items to output this cycle
E0319 14:27:03.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:03.409895 543705 memory.go:184] no items to output this cycle
I0319 14:27:03.409963 543705 cpu.go:275] no items to output this cycle
E0319 14:27:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:13.409805 543705 memory.go:191] Add success.
I0319 14:27:13.409821 543705 cpu.go:282] Add success.
W0319 14:27:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:27:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:27:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:27:13.420138 543705 net.go:648] Add success.
I0319 14:27:13.428822 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 14:27:13.428900 543705 net.go:770] primary dev: ETH0
I0319 14:27:13.428918 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:27:13.428933 543705 net.go:698] Add success.
I0319 14:27:13.453455 543705 event_worker.go:152] Polling the log file for events...
I0319 14:27:13.468601 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6b6e02f5-f55e-4f98-a4da-4035830a4317","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:27:13.468644 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 14:27:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:27:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 14:27:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:27:14.456891 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:27:14.456902 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:27:14.456907 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:27:14.456977 543705 disk_worker.go:494] system disk:vda1
I0319 14:27:14.457019 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:27:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:27:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:27:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:27:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:27:16.457961 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:27:16.457980 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:27:16.472322 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:27:19.560507 543705 disk_info.go:125] begin check local disk info of client
I0319 14:27:19.562891 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:27:19.562897 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a84c0 0xc0004a8500]
E0319 14:27:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:23.409797 543705 memory.go:184] no items to output this cycle
I0319 14:27:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 14:27:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:33.409783 543705 memory.go:184] no items to output this cycle
I0319 14:27:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 14:27:37.736388 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:27:37.736394 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:27:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:43.410560 543705 memory.go:191] Add success.
I0319 14:27:43.409838 543705 cpu.go:282] Add success.
I0319 14:27:43.420255 543705 net.go:648] Add success.
I0319 14:27:43.422840 543705 net.go:770] primary dev: ETH0
I0319 14:27:43.422853 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:27:43.422865 543705 net.go:698] Add success.
I0319 14:27:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:27:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:27:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:27:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:27:53.409803 543705 memory.go:184] no items to output this cycle
I0319 14:27:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 14:28:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:03.409794 543705 memory.go:184] no items to output this cycle
I0319 14:28:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:28:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:13.409811 543705 memory.go:191] Add success.
I0319 14:28:13.409817 543705 cpu.go:282] Add success.
W0319 14:28:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:28:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:28:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:28:13.420316 543705 net.go:648] Add success.
I0319 14:28:13.423187 543705 net.go:770] primary dev: ETH0
I0319 14:28:13.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:28:13.423216 543705 net.go:698] Add success.
I0319 14:28:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:28:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:28:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 14:28:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:28:14.456545 543705 disk_worker.go:494] system disk:vda1
I0319 14:28:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:28:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:28:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:28:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:28:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:28:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:28:19.562978 543705 disk_info.go:125] begin check local disk info of client
I0319 14:28:19.565367 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:28:19.565372 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a88c0 0xc0004a8900]
E0319 14:28:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:23.409820 543705 memory.go:184] no items to output this cycle
I0319 14:28:23.409828 543705 cpu.go:275] no items to output this cycle
E0319 14:28:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:33.409769 543705 memory.go:184] no items to output this cycle
I0319 14:28:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 14:28:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:43.409821 543705 memory.go:191] Add success.
I0319 14:28:43.409838 543705 cpu.go:282] Add success.
I0319 14:28:43.420022 543705 net.go:648] Add success.
I0319 14:28:43.422817 543705 net.go:770] primary dev: ETH0
I0319 14:28:43.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:28:43.422872 543705 net.go:698] Add success.
I0319 14:28:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:28:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:28:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:28:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:28:53.409804 543705 memory.go:184] no items to output this cycle
I0319 14:28:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 14:29:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:03.409788 543705 memory.go:184] no items to output this cycle
I0319 14:29:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:29:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:13.409803 543705 memory.go:191] Add success.
I0319 14:29:13.409805 543705 cpu.go:282] Add success.
W0319 14:29:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:29:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:29:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:29:13.420232 543705 net.go:648] Add success.
I0319 14:29:13.423049 543705 net.go:770] primary dev: ETH0
I0319 14:29:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:29:13.423075 543705 net.go:698] Add success.
I0319 14:29:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:29:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:29:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 14:29:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:29:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 14:29:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:29:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:29:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:29:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:29:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:29:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:29:19.565465 543705 disk_info.go:125] begin check local disk info of client
I0319 14:29:19.567850 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:29:19.567856 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003293c0 0xc000329400]
E0319 14:29:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:23.409794 543705 memory.go:184] no items to output this cycle
I0319 14:29:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 14:29:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:33.409777 543705 memory.go:184] no items to output this cycle
I0319 14:29:33.409781 543705 cpu.go:275] no items to output this cycle
E0319 14:29:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:43.409790 543705 memory.go:191] Add success.
I0319 14:29:43.409809 543705 cpu.go:282] Add success.
I0319 14:29:43.420044 543705 net.go:648] Add success.
I0319 14:29:43.422910 543705 net.go:770] primary dev: ETH0
I0319 14:29:43.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:29:43.422936 543705 net.go:698] Add success.
I0319 14:29:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:29:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:29:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:29:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:29:53.409802 543705 memory.go:184] no items to output this cycle
I0319 14:29:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 14:30:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:03.409780 543705 memory.go:184] no items to output this cycle
I0319 14:30:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:30:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:13.409800 543705 memory.go:191] Add success.
I0319 14:30:13.409802 543705 cpu.go:282] Add success.
W0319 14:30:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:30:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:30:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:30:13.420209 543705 net.go:648] Add success.
I0319 14:30:13.422907 543705 net.go:770] primary dev: ETH0
I0319 14:30:13.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:30:13.422931 543705 net.go:698] Add success.
I0319 14:30:13.469214 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e9df7ce7-89cc-471e-b137-29af99d2df0e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:30:13.469251 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:30:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:30:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:30:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 14:30:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:30:14.456734 543705 disk_worker.go:494] system disk:vda1
I0319 14:30:14.456764 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:30:15.455609 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:30:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:30:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:30:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:30:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:30:19.568477 543705 disk_info.go:125] begin check local disk info of client
I0319 14:30:19.570908 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:30:19.570913 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab780 0xc0001ab7c0]
E0319 14:30:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:23.409809 543705 memory.go:184] no items to output this cycle
I0319 14:30:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:30:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:33.409764 543705 memory.go:184] no items to output this cycle
I0319 14:30:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 14:30:37.737391 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:30:37.737399 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:30:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:43.410670 543705 memory.go:191] Add success.
I0319 14:30:43.409816 543705 cpu.go:282] Add success.
I0319 14:30:43.420176 543705 net.go:770] primary dev: ETH0
I0319 14:30:43.420188 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:30:43.420202 543705 net.go:698] Add success.
I0319 14:30:43.420539 543705 net.go:648] Add success.
I0319 14:30:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:30:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:30:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:30:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:30:53.409799 543705 memory.go:184] no items to output this cycle
I0319 14:30:53.409815 543705 cpu.go:275] no items to output this cycle
I0319 14:31:03.409874 543705 cpu.go:275] no items to output this cycle
E0319 14:31:03.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:03.409893 543705 memory.go:184] no items to output this cycle
E0319 14:31:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:13.409796 543705 memory.go:191] Add success.
I0319 14:31:13.409802 543705 cpu.go:282] Add success.
W0319 14:31:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:31:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:31:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:31:13.420117 543705 net.go:648] Add success.
I0319 14:31:13.422754 543705 net.go:770] primary dev: ETH0
I0319 14:31:13.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:31:13.422779 543705 net.go:698] Add success.
I0319 14:31:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:31:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:31:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 14:31:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:31:14.456521 543705 disk_worker.go:494] system disk:vda1
I0319 14:31:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:31:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:31:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:31:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:31:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:31:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:31:19.571498 543705 disk_info.go:125] begin check local disk info of client
I0319 14:31:19.573962 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:31:19.573968 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007adc0 0xc00007ae00]
E0319 14:31:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:23.409760 543705 memory.go:184] no items to output this cycle
I0319 14:31:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:31:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:33.409802 543705 memory.go:184] no items to output this cycle
I0319 14:31:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:31:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:43.409794 543705 cpu.go:282] Add success.
I0319 14:31:43.409802 543705 memory.go:191] Add success.
I0319 14:31:43.419961 543705 net.go:648] Add success.
I0319 14:31:43.423065 543705 net.go:770] primary dev: ETH0
I0319 14:31:43.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:31:43.423091 543705 net.go:698] Add success.
I0319 14:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:31:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:31:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:31:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:31:53.409766 543705 memory.go:184] no items to output this cycle
I0319 14:31:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:32:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:03.409786 543705 memory.go:184] no items to output this cycle
I0319 14:32:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 14:32:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:13.409791 543705 memory.go:191] Add success.
W0319 14:32:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:32:13.409825 543705 cpu.go:282] Add success.
W0319 14:32:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:32:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:32:13.420120 543705 net.go:648] Add success.
I0319 14:32:13.422870 543705 net.go:770] primary dev: ETH0
I0319 14:32:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:32:13.422895 543705 net.go:698] Add success.
W0319 14:32:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:32:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 14:32:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:32:14.456918 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:32:14.456927 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:32:14.456934 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:32:14.457000 543705 disk_worker.go:494] system disk:vda1
I0319 14:32:14.457029 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:32:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:32:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:32:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:32:16.457942 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:32:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:32:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:32:16.472318 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:32:19.574514 543705 disk_info.go:125] begin check local disk info of client
I0319 14:32:19.576908 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:32:19.576913 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c46c0 0xc0000c4700]
E0319 14:32:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:23.409781 543705 memory.go:184] no items to output this cycle
I0319 14:32:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 14:32:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:33.409804 543705 memory.go:184] no items to output this cycle
I0319 14:32:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 14:32:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:43.409805 543705 memory.go:191] Add success.
I0319 14:32:43.409804 543705 cpu.go:282] Add success.
I0319 14:32:43.419952 543705 net.go:648] Add success.
I0319 14:32:43.422647 543705 net.go:770] primary dev: ETH0
I0319 14:32:43.422659 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:32:43.422672 543705 net.go:698] Add success.
I0319 14:32:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:32:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:32:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:32:53.410205 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:32:53.410220 543705 memory.go:184] no items to output this cycle
I0319 14:32:53.410229 543705 cpu.go:275] no items to output this cycle
E0319 14:33:03.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:03.409865 543705 cpu.go:275] no items to output this cycle
I0319 14:33:03.409886 543705 memory.go:184] no items to output this cycle
E0319 14:33:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:13.409830 543705 memory.go:191] Add success.
I0319 14:33:13.409832 543705 cpu.go:282] Add success.
W0319 14:33:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:33:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:33:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:33:13.420197 543705 net.go:648] Add success.
I0319 14:33:13.423034 543705 net.go:770] primary dev: ETH0
I0319 14:33:13.423048 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:33:13.423060 543705 net.go:698] Add success.
I0319 14:33:13.464001 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"105e095c-5c5f-40b6-8189-f13fa5ec2c9a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:33:13.464035 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:33:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:33:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:33:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 14:33:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:33:14.456526 543705 disk_worker.go:494] system disk:vda1
I0319 14:33:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:33:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:33:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:33:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:33:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:33:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:33:19.577525 543705 disk_info.go:125] begin check local disk info of client
I0319 14:33:19.579964 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:33:19.579970 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4bc0 0xc0000c4c00]
E0319 14:33:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:23.409758 543705 memory.go:184] no items to output this cycle
I0319 14:33:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:33:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:33.409771 543705 memory.go:184] no items to output this cycle
I0319 14:33:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 14:33:37.737733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:33:37.737740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:33:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:43.410680 543705 memory.go:191] Add success.
I0319 14:33:43.409807 543705 cpu.go:282] Add success.
I0319 14:33:43.420445 543705 net.go:648] Add success.
I0319 14:33:43.423552 543705 net.go:770] primary dev: ETH0
I0319 14:33:43.423567 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:33:43.423578 543705 net.go:698] Add success.
I0319 14:33:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:33:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:33:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:33:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:33:53.409815 543705 memory.go:184] no items to output this cycle
I0319 14:33:53.409829 543705 cpu.go:275] no items to output this cycle
E0319 14:34:03.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:03.409920 543705 memory.go:184] no items to output this cycle
I0319 14:34:03.409925 543705 cpu.go:275] no items to output this cycle
E0319 14:34:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:13.409830 543705 memory.go:191] Add success.
I0319 14:34:13.409857 543705 cpu.go:282] Add success.
W0319 14:34:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:34:13.413131 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:34:13.413137 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:34:13.419817 543705 net.go:648] Add success.
I0319 14:34:13.422235 543705 net.go:770] primary dev: ETH0
I0319 14:34:13.422252 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:34:13.422266 543705 net.go:698] Add success.
I0319 14:34:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:34:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:34:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 14:34:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:34:14.456669 543705 disk_worker.go:494] system disk:vda1
I0319 14:34:14.456700 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:34:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:34:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:34:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:34:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:34:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:34:19.580053 543705 disk_info.go:125] begin check local disk info of client
I0319 14:34:19.582631 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:34:19.582637 543705 disk_info.go:196] parse disk info done, disk is : [0xc000217600 0xc000217640]
E0319 14:34:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:23.409802 543705 memory.go:184] no items to output this cycle
I0319 14:34:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:34:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 14:34:33.409808 543705 memory.go:184] no items to output this cycle
E0319 14:34:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:43.409809 543705 memory.go:191] Add success.
I0319 14:34:43.409810 543705 cpu.go:282] Add success.
I0319 14:34:43.419883 543705 net.go:648] Add success.
I0319 14:34:43.422694 543705 net.go:770] primary dev: ETH0
I0319 14:34:43.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:34:43.422719 543705 net.go:698] Add success.
I0319 14:34:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:34:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:34:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:34:53.409806 543705 memory.go:184] no items to output this cycle
I0319 14:34:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 14:35:03.409897 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:03.409939 543705 memory.go:184] no items to output this cycle
I0319 14:35:03.409918 543705 cpu.go:275] no items to output this cycle
W0319 14:35:13.409727 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:35:13.409744 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:35:13.409750 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:35:13.409811 543705 cpu.go:282] Add success.
E0319 14:35:13.409850 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:13.409873 543705 memory.go:191] Add success.
I0319 14:35:13.420344 543705 net.go:648] Add success.
I0319 14:35:13.423116 543705 net.go:770] primary dev: ETH0
I0319 14:35:13.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:35:13.423146 543705 net.go:698] Add success.
I0319 14:35:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:35:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:35:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 14:35:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:35:14.456534 543705 disk_worker.go:494] system disk:vda1
I0319 14:35:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:35:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:35:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:35:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:35:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:35:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:35:19.583599 543705 disk_info.go:125] begin check local disk info of client
I0319 14:35:19.586004 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:35:19.586010 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4700 0xc0000c4740]
E0319 14:35:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:23.409800 543705 memory.go:184] no items to output this cycle
I0319 14:35:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:35:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:33.409790 543705 memory.go:184] no items to output this cycle
I0319 14:35:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:35:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:43.409801 543705 memory.go:191] Add success.
I0319 14:35:43.409802 543705 cpu.go:282] Add success.
I0319 14:35:43.419892 543705 net.go:648] Add success.
I0319 14:35:43.422481 543705 net.go:770] primary dev: ETH0
I0319 14:35:43.422494 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:35:43.422507 543705 net.go:698] Add success.
I0319 14:35:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:35:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:35:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:35:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:35:53.409787 543705 cpu.go:275] no items to output this cycle
I0319 14:35:53.409792 543705 memory.go:184] no items to output this cycle
E0319 14:36:03.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:03.409899 543705 memory.go:184] no items to output this cycle
I0319 14:36:03.409900 543705 cpu.go:275] no items to output this cycle
E0319 14:36:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:13.409833 543705 memory.go:191] Add success.
I0319 14:36:13.409845 543705 cpu.go:282] Add success.
W0319 14:36:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:36:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:36:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:36:13.420175 543705 net.go:648] Add success.
I0319 14:36:13.422995 543705 net.go:770] primary dev: ETH0
I0319 14:36:13.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:36:13.423019 543705 net.go:698] Add success.
I0319 14:36:13.469166 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0165d532-0813-4693-8160-9ab6c9a0cdaa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:36:13.469199 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:36:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:36:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:36:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 14:36:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:36:14.456588 543705 disk_worker.go:494] system disk:vda1
I0319 14:36:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:36:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:36:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:36:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:36:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:36:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:36:19.586622 543705 disk_info.go:125] begin check local disk info of client
I0319 14:36:19.589043 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:36:19.589049 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab200 0xc0001ab240]
E0319 14:36:23.410235 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:23.410250 543705 memory.go:184] no items to output this cycle
I0319 14:36:23.410278 543705 cpu.go:275] no items to output this cycle
E0319 14:36:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:33.409782 543705 memory.go:184] no items to output this cycle
I0319 14:36:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 14:36:37.737900 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:36:37.737907 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:36:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:43.410687 543705 memory.go:191] Add success.
I0319 14:36:43.409846 543705 cpu.go:282] Add success.
I0319 14:36:43.420385 543705 net.go:648] Add success.
I0319 14:36:43.423212 543705 net.go:770] primary dev: ETH0
I0319 14:36:43.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:36:43.423241 543705 net.go:698] Add success.
I0319 14:36:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:36:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:36:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:36:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:36:53.409790 543705 memory.go:184] no items to output this cycle
I0319 14:36:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 14:37:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:03.409777 543705 memory.go:184] no items to output this cycle
I0319 14:37:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:37:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:13.409807 543705 memory.go:191] Add success.
I0319 14:37:13.409809 543705 cpu.go:282] Add success.
W0319 14:37:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:37:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:37:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:37:13.420157 543705 net.go:648] Add success.
I0319 14:37:13.422810 543705 net.go:770] primary dev: ETH0
I0319 14:37:13.422823 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:37:13.422835 543705 net.go:698] Add success.
I0319 14:37:13.452780 543705 event_worker.go:152] Polling the log file for events...
W0319 14:37:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:37:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 14:37:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:37:14.455896 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:37:14.455905 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:37:14.455911 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:37:14.456554 543705 disk_worker.go:494] system disk:vda1
I0319 14:37:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:37:15.456859 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:37:15.456868 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:37:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:37:16.457936 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:37:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:37:16.457995 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:37:16.472333 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:37:19.589132 543705 disk_info.go:125] begin check local disk info of client
I0319 14:37:19.591503 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:37:19.591509 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aad40 0xc0001aad80]
E0319 14:37:23.410720 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:23.410734 543705 memory.go:184] no items to output this cycle
I0319 14:37:23.410737 543705 cpu.go:275] no items to output this cycle
E0319 14:37:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:33.409775 543705 memory.go:184] no items to output this cycle
I0319 14:37:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 14:37:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:43.409818 543705 memory.go:191] Add success.
I0319 14:37:43.409821 543705 cpu.go:282] Add success.
I0319 14:37:43.419924 543705 net.go:648] Add success.
I0319 14:37:43.422736 543705 net.go:770] primary dev: ETH0
I0319 14:37:43.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:37:43.422764 543705 net.go:698] Add success.
I0319 14:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:37:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:37:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:37:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:37:53.409784 543705 memory.go:184] no items to output this cycle
I0319 14:37:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 14:38:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:03.409805 543705 memory.go:184] no items to output this cycle
I0319 14:38:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 14:38:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:13.409800 543705 memory.go:191] Add success.
I0319 14:38:13.409808 543705 cpu.go:282] Add success.
W0319 14:38:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:38:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:38:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:38:13.420134 543705 net.go:648] Add success.
I0319 14:38:13.422903 543705 net.go:770] primary dev: ETH0
I0319 14:38:13.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:38:13.422928 543705 net.go:698] Add success.
I0319 14:38:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:38:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:38:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 14:38:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:38:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 14:38:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:38:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:38:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:38:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:38:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:38:16.472435 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:38:19.591605 543705 disk_info.go:125] begin check local disk info of client
I0319 14:38:19.594018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:38:19.594024 543705 disk_info.go:196] parse disk info done, disk is : [0xc000356bc0 0xc000356c00]
E0319 14:38:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:23.409797 543705 memory.go:184] no items to output this cycle
I0319 14:38:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:38:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:33.409764 543705 memory.go:184] no items to output this cycle
I0319 14:38:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 14:38:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:43.409796 543705 memory.go:191] Add success.
I0319 14:38:43.409796 543705 cpu.go:282] Add success.
I0319 14:38:43.419976 543705 net.go:648] Add success.
I0319 14:38:43.423001 543705 net.go:770] primary dev: ETH0
I0319 14:38:43.423014 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:38:43.423025 543705 net.go:698] Add success.
I0319 14:38:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:38:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:38:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:38:53.410210 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:38:53.410340 543705 memory.go:184] no items to output this cycle
I0319 14:38:53.410343 543705 cpu.go:275] no items to output this cycle
E0319 14:39:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:03.409772 543705 memory.go:184] no items to output this cycle
I0319 14:39:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:39:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:13.409828 543705 memory.go:191] Add success.
I0319 14:39:13.409830 543705 cpu.go:282] Add success.
W0319 14:39:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:39:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:39:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:39:13.420200 543705 net.go:648] Add success.
I0319 14:39:13.422805 543705 net.go:770] primary dev: ETH0
I0319 14:39:13.422818 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:39:13.422830 543705 net.go:698] Add success.
I0319 14:39:13.463997 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98722457-5358-4142-898e-8541ef22a4c7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:39:13.464029 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:39:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:39:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:39:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 14:39:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:39:14.456731 543705 disk_worker.go:494] system disk:vda1
I0319 14:39:14.456763 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:39:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:39:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:39:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:39:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:39:19.594609 543705 disk_info.go:125] begin check local disk info of client
I0319 14:39:19.597018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:39:19.597025 543705 disk_info.go:196] parse disk info done, disk is : [0xc000585500 0xc000585540]
E0319 14:39:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:23.409796 543705 memory.go:184] no items to output this cycle
I0319 14:39:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 14:39:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:33.409785 543705 memory.go:184] no items to output this cycle
I0319 14:39:33.409792 543705 cpu.go:275] no items to output this cycle
I0319 14:39:37.739391 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:39:37.739397 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:39:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:43.410638 543705 memory.go:191] Add success.
I0319 14:39:43.409812 543705 cpu.go:282] Add success.
I0319 14:39:43.420640 543705 net.go:648] Add success.
I0319 14:39:43.423398 543705 net.go:770] primary dev: ETH0
I0319 14:39:43.423411 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:39:43.423423 543705 net.go:698] Add success.
I0319 14:39:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:39:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:39:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:39:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:39:53.409790 543705 memory.go:184] no items to output this cycle
I0319 14:39:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:40:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:03.409806 543705 memory.go:184] no items to output this cycle
I0319 14:40:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 14:40:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:13.409802 543705 cpu.go:282] Add success.
I0319 14:40:13.409804 543705 memory.go:191] Add success.
W0319 14:40:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:40:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:40:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:40:13.420057 543705 net.go:648] Add success.
I0319 14:40:13.422672 543705 net.go:770] primary dev: ETH0
I0319 14:40:13.422685 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:40:13.422697 543705 net.go:698] Add success.
I0319 14:40:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:40:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:40:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 14:40:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:40:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 14:40:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:40:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:40:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:40:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:40:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:40:19.597619 543705 disk_info.go:125] begin check local disk info of client
I0319 14:40:19.600055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:40:19.600061 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039b6c0 0xc00039b700]
E0319 14:40:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:23.409798 543705 memory.go:184] no items to output this cycle
I0319 14:40:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:40:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:33.409779 543705 memory.go:184] no items to output this cycle
I0319 14:40:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 14:40:43.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:43.409924 543705 memory.go:191] Add success.
I0319 14:40:43.409979 543705 cpu.go:282] Add success.
I0319 14:40:43.419714 543705 net.go:648] Add success.
I0319 14:40:43.423230 543705 net.go:770] primary dev: ETH0
I0319 14:40:43.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:40:43.423255 543705 net.go:698] Add success.
I0319 14:40:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:40:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:40:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:40:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:40:53.409787 543705 memory.go:184] no items to output this cycle
I0319 14:40:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:41:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:03.409790 543705 memory.go:184] no items to output this cycle
I0319 14:41:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:41:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:13.409782 543705 memory.go:191] Add success.
W0319 14:41:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:41:13.409818 543705 cpu.go:282] Add success.
W0319 14:41:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:41:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:41:13.420376 543705 net.go:648] Add success.
I0319 14:41:13.423264 543705 net.go:770] primary dev: ETH0
I0319 14:41:13.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:41:13.423291 543705 net.go:698] Add success.
I0319 14:41:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:41:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:41:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 14:41:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:41:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 14:41:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:41:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:41:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:41:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:41:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:41:19.600144 543705 disk_info.go:125] begin check local disk info of client
I0319 14:41:19.602541 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:41:19.602550 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032a000 0xc00032a040]
E0319 14:41:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:23.409793 543705 memory.go:184] no items to output this cycle
I0319 14:41:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:41:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:33.409797 543705 memory.go:184] no items to output this cycle
I0319 14:41:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:41:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:43.409830 543705 memory.go:191] Add success.
I0319 14:41:43.409840 543705 cpu.go:282] Add success.
I0319 14:41:43.419963 543705 net.go:648] Add success.
I0319 14:41:43.422695 543705 net.go:770] primary dev: ETH0
I0319 14:41:43.422710 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:41:43.422724 543705 net.go:698] Add success.
I0319 14:41:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:41:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:41:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:41:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:41:53.409804 543705 memory.go:184] no items to output this cycle
I0319 14:41:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:42:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:03.409780 543705 memory.go:184] no items to output this cycle
I0319 14:42:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 14:42:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:13.409798 543705 memory.go:191] Add success.
I0319 14:42:13.409799 543705 cpu.go:282] Add success.
W0319 14:42:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:42:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:42:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:42:13.420063 543705 net.go:648] Add success.
I0319 14:42:13.422961 543705 net.go:770] primary dev: ETH0
I0319 14:42:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:42:13.422991 543705 net.go:698] Add success.
I0319 14:42:13.555969 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f83ddd21-ea4a-466b-b60b-a70dcc3e6744","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:42:13.556003 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 14:42:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:42:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 14:42:14.455166 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:42:14.455963 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:42:14.455971 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:42:14.455976 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:42:14.456469 543705 disk_worker.go:494] system disk:vda1
I0319 14:42:14.456501 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:42:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 14:42:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:42:16.457916 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:42:16.457916 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:42:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:42:16.457992 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:42:16.472316 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:42:19.602630 543705 disk_info.go:125] begin check local disk info of client
I0319 14:42:19.605034 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:42:19.605040 543705 disk_info.go:196] parse disk info done, disk is : [0xc000369b80 0xc000369bc0]
E0319 14:42:23.410703 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:23.410815 543705 memory.go:184] no items to output this cycle
I0319 14:42:23.410830 543705 cpu.go:275] no items to output this cycle
E0319 14:42:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:33.409783 543705 memory.go:184] no items to output this cycle
I0319 14:42:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 14:42:37.740394 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:42:37.740401 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:42:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:43.410672 543705 memory.go:191] Add success.
I0319 14:42:43.409816 543705 cpu.go:282] Add success.
I0319 14:42:43.420372 543705 net.go:648] Add success.
I0319 14:42:43.423458 543705 net.go:770] primary dev: ETH0
I0319 14:42:43.423481 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:42:43.423496 543705 net.go:698] Add success.
I0319 14:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:42:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:42:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:42:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:42:53.409770 543705 memory.go:184] no items to output this cycle
I0319 14:42:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:43:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:03.409799 543705 memory.go:184] no items to output this cycle
I0319 14:43:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 14:43:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:13.409793 543705 memory.go:191] Add success.
I0319 14:43:13.409810 543705 cpu.go:282] Add success.
W0319 14:43:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:43:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:43:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:43:13.420142 543705 net.go:648] Add success.
I0319 14:43:13.422802 543705 net.go:770] primary dev: ETH0
I0319 14:43:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:43:13.422829 543705 net.go:698] Add success.
I0319 14:43:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:43:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:43:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 14:43:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:43:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 14:43:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:43:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:43:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:43:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:43:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:43:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:43:19.605670 543705 disk_info.go:125] begin check local disk info of client
I0319 14:43:19.608038 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:43:19.608044 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e6f40 0xc0000e6f80]
E0319 14:43:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:23.409791 543705 memory.go:184] no items to output this cycle
I0319 14:43:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:43:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:33.409804 543705 memory.go:184] no items to output this cycle
I0319 14:43:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 14:43:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:43.409785 543705 memory.go:191] Add success.
I0319 14:43:43.409807 543705 cpu.go:282] Add success.
I0319 14:43:43.419928 543705 net.go:648] Add success.
I0319 14:43:43.422861 543705 net.go:770] primary dev: ETH0
I0319 14:43:43.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:43:43.422889 543705 net.go:698] Add success.
I0319 14:43:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:43:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:43:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:43:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:43:53.409799 543705 memory.go:184] no items to output this cycle
I0319 14:43:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:44:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:03.409768 543705 memory.go:184] no items to output this cycle
I0319 14:44:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 14:44:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:13.409822 543705 memory.go:191] Add success.
I0319 14:44:13.409831 543705 cpu.go:282] Add success.
W0319 14:44:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:44:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:44:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:44:13.420172 543705 net.go:648] Add success.
I0319 14:44:13.423291 543705 net.go:770] primary dev: ETH0
I0319 14:44:13.423304 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:44:13.423316 543705 net.go:698] Add success.
I0319 14:44:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:44:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:44:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 14:44:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:44:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 14:44:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:44:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:44:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:44:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:44:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:44:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:44:19.608683 543705 disk_info.go:125] begin check local disk info of client
I0319 14:44:19.611173 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:44:19.611179 543705 disk_info.go:196] parse disk info done, disk is : [0xc000305180 0xc0003051c0]
E0319 14:44:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:23.409790 543705 memory.go:184] no items to output this cycle
I0319 14:44:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 14:44:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:33.409887 543705 memory.go:184] no items to output this cycle
I0319 14:44:33.409887 543705 cpu.go:275] no items to output this cycle
E0319 14:44:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:43.409800 543705 memory.go:191] Add success.
I0319 14:44:43.409843 543705 cpu.go:282] Add success.
I0319 14:44:43.420439 543705 net.go:648] Add success.
I0319 14:44:43.423311 543705 net.go:770] primary dev: ETH0
I0319 14:44:43.423324 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:44:43.423336 543705 net.go:698] Add success.
I0319 14:44:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:44:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:44:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:44:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:44:53.409815 543705 memory.go:184] no items to output this cycle
I0319 14:44:53.409830 543705 cpu.go:275] no items to output this cycle
E0319 14:45:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:03.409771 543705 memory.go:184] no items to output this cycle
I0319 14:45:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 14:45:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:13.409839 543705 memory.go:191] Add success.
I0319 14:45:13.409864 543705 cpu.go:282] Add success.
W0319 14:45:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:45:13.409886 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:45:13.409890 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:45:13.420233 543705 net.go:648] Add success.
I0319 14:45:13.422927 543705 net.go:770] primary dev: ETH0
I0319 14:45:13.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:45:13.422951 543705 net.go:698] Add success.
I0319 14:45:13.469966 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bfcce61a-4060-465c-a824-b850097f06e8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:45:13.470001 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:45:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:45:14.455220 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:45:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0319 14:45:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:45:14.456635 543705 disk_worker.go:494] system disk:vda1
I0319 14:45:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:45:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:45:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:45:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:45:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:45:19.611699 543705 disk_info.go:125] begin check local disk info of client
I0319 14:45:19.614099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:45:19.614105 543705 disk_info.go:196] parse disk info done, disk is : [0xc000253500 0xc000253540]
E0319 14:45:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:23.409788 543705 memory.go:184] no items to output this cycle
I0319 14:45:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 14:45:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:33.409812 543705 memory.go:184] no items to output this cycle
I0319 14:45:33.409829 543705 cpu.go:275] no items to output this cycle
I0319 14:45:37.741409 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:45:37.741415 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:45:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:43.410730 543705 memory.go:191] Add success.
I0319 14:45:43.409819 543705 cpu.go:282] Add success.
I0319 14:45:43.420533 543705 net.go:648] Add success.
I0319 14:45:43.423630 543705 net.go:770] primary dev: ETH0
I0319 14:45:43.423644 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:45:43.423657 543705 net.go:698] Add success.
I0319 14:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:45:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:45:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:45:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:45:53.409797 543705 memory.go:184] no items to output this cycle
I0319 14:45:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 14:46:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:03.409768 543705 memory.go:184] no items to output this cycle
I0319 14:46:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:46:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:13.409828 543705 memory.go:191] Add success.
I0319 14:46:13.409836 543705 cpu.go:282] Add success.
W0319 14:46:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:46:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:46:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:46:13.420138 543705 net.go:648] Add success.
I0319 14:46:13.423107 543705 net.go:770] primary dev: ETH0
I0319 14:46:13.423120 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:46:13.423132 543705 net.go:698] Add success.
I0319 14:46:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:46:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:46:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 14:46:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:46:14.456568 543705 disk_worker.go:494] system disk:vda1
I0319 14:46:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:46:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:46:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:46:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:46:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:46:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:46:19.614768 543705 disk_info.go:125] begin check local disk info of client
I0319 14:46:19.617149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:46:19.617155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003246c0 0xc000324700]
E0319 14:46:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:23.409798 543705 memory.go:184] no items to output this cycle
I0319 14:46:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 14:46:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:33.409770 543705 memory.go:184] no items to output this cycle
I0319 14:46:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:46:43.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:43.409920 543705 memory.go:191] Add success.
I0319 14:46:43.410014 543705 cpu.go:282] Add success.
I0319 14:46:43.419731 543705 net.go:648] Add success.
I0319 14:46:43.422396 543705 net.go:770] primary dev: ETH0
I0319 14:46:43.422409 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:46:43.422422 543705 net.go:698] Add success.
I0319 14:46:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:46:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:46:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:46:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:46:53.409770 543705 memory.go:184] no items to output this cycle
I0319 14:46:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:47:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:03.409779 543705 memory.go:184] no items to output this cycle
I0319 14:47:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 14:47:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:13.409816 543705 memory.go:191] Add success.
I0319 14:47:13.409827 543705 cpu.go:282] Add success.
W0319 14:47:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:47:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:47:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:47:13.420149 543705 net.go:648] Add success.
I0319 14:47:13.423256 543705 net.go:770] primary dev: ETH0
I0319 14:47:13.423272 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:47:13.423287 543705 net.go:698] Add success.
I0319 14:47:13.452793 543705 event_worker.go:152] Polling the log file for events...
W0319 14:47:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:47:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 14:47:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:47:14.456784 543705 disk_worker.go:494] system disk:vda1
I0319 14:47:14.456825 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:47:14.457003 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:47:14.457012 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:47:14.457018 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:47:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 14:47:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:47:16.457940 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:47:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:47:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:47:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:47:16.472338 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:47:19.617673 543705 disk_info.go:125] begin check local disk info of client
I0319 14:47:19.620016 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:47:19.620023 543705 disk_info.go:196] parse disk info done, disk is : [0xc000325800 0xc000325840]
E0319 14:47:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:23.409789 543705 memory.go:184] no items to output this cycle
I0319 14:47:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:47:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:33.409779 543705 memory.go:184] no items to output this cycle
I0319 14:47:33.409781 543705 cpu.go:275] no items to output this cycle
E0319 14:47:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:43.409793 543705 memory.go:191] Add success.
I0319 14:47:43.409796 543705 cpu.go:282] Add success.
I0319 14:47:43.419917 543705 net.go:648] Add success.
I0319 14:47:43.422777 543705 net.go:770] primary dev: ETH0
I0319 14:47:43.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:47:43.422806 543705 net.go:698] Add success.
I0319 14:47:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:47:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:47:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:47:53.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:47:53.409981 543705 cpu.go:275] no items to output this cycle
I0319 14:47:53.410040 543705 memory.go:184] no items to output this cycle
E0319 14:48:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:03.409776 543705 memory.go:184] no items to output this cycle
I0319 14:48:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 14:48:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:13.409794 543705 memory.go:191] Add success.
W0319 14:48:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:48:13.409827 543705 cpu.go:282] Add success.
W0319 14:48:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:48:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:48:13.420170 543705 net.go:648] Add success.
I0319 14:48:13.423158 543705 net.go:770] primary dev: ETH0
I0319 14:48:13.423173 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:48:13.423188 543705 net.go:698] Add success.
I0319 14:48:13.538188 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b07ba48b-0a88-4b7e-b013-52f472b35bd5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:48:13.538222 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:48:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:48:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:48:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 14:48:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:48:14.456739 543705 disk_worker.go:494] system disk:vda1
I0319 14:48:14.456772 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:48:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:48:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:48:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:48:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:48:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:48:19.620749 543705 disk_info.go:125] begin check local disk info of client
I0319 14:48:19.623177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:48:19.623183 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328800 0xc000328840]
E0319 14:48:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:23.409796 543705 memory.go:184] no items to output this cycle
I0319 14:48:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:48:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:33.409779 543705 memory.go:184] no items to output this cycle
I0319 14:48:33.409782 543705 cpu.go:275] no items to output this cycle
I0319 14:48:37.741733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:48:37.741741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:48:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:43.410607 543705 memory.go:191] Add success.
I0319 14:48:43.409797 543705 cpu.go:282] Add success.
I0319 14:48:43.420341 543705 net.go:648] Add success.
I0319 14:48:43.423056 543705 net.go:770] primary dev: ETH0
I0319 14:48:43.423069 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:48:43.423082 543705 net.go:698] Add success.
I0319 14:48:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:48:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:48:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:48:53.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:48:53.409879 543705 memory.go:184] no items to output this cycle
I0319 14:48:53.410016 543705 cpu.go:275] no items to output this cycle
E0319 14:49:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:03.409808 543705 memory.go:184] no items to output this cycle
I0319 14:49:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 14:49:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:13.409823 543705 memory.go:191] Add success.
I0319 14:49:13.409834 543705 cpu.go:282] Add success.
W0319 14:49:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:49:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:49:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:49:13.420176 543705 net.go:648] Add success.
I0319 14:49:13.423230 543705 net.go:770] primary dev: ETH0
I0319 14:49:13.423242 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:49:13.423255 543705 net.go:698] Add success.
I0319 14:49:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:49:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:49:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0319 14:49:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:49:14.456511 543705 disk_worker.go:494] system disk:vda1
I0319 14:49:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:49:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:49:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:49:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:49:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:49:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:49:19.623761 543705 disk_info.go:125] begin check local disk info of client
I0319 14:49:19.626176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:49:19.626183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7cc0 0xc0003b7d00]
E0319 14:49:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:23.409790 543705 memory.go:184] no items to output this cycle
I0319 14:49:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 14:49:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:33.409776 543705 memory.go:184] no items to output this cycle
I0319 14:49:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 14:49:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:43.409803 543705 memory.go:191] Add success.
I0319 14:49:43.409803 543705 cpu.go:282] Add success.
I0319 14:49:43.419931 543705 net.go:648] Add success.
I0319 14:49:43.423019 543705 net.go:770] primary dev: ETH0
I0319 14:49:43.423034 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:49:43.423050 543705 net.go:698] Add success.
I0319 14:49:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:49:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:49:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:49:53.409904 543705 cpu.go:275] no items to output this cycle
E0319 14:49:53.409917 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:49:53.409961 543705 memory.go:184] no items to output this cycle
E0319 14:50:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:03.409766 543705 memory.go:184] no items to output this cycle
I0319 14:50:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 14:50:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:13.409805 543705 memory.go:191] Add success.
I0319 14:50:13.409811 543705 cpu.go:282] Add success.
W0319 14:50:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:50:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:50:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:50:13.420219 543705 net.go:648] Add success.
I0319 14:50:13.423092 543705 net.go:770] primary dev: ETH0
I0319 14:50:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:50:13.423117 543705 net.go:698] Add success.
I0319 14:50:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:50:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:50:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 14:50:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:50:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 14:50:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:50:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:50:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:50:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:50:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:50:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:50:19.626773 543705 disk_info.go:125] begin check local disk info of client
I0319 14:50:19.629186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:50:19.629193 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252cc0 0xc000252d00]
E0319 14:50:23.410405 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:23.410423 543705 memory.go:184] no items to output this cycle
I0319 14:50:23.410446 543705 cpu.go:275] no items to output this cycle
E0319 14:50:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:33.409773 543705 memory.go:184] no items to output this cycle
I0319 14:50:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 14:50:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:43.409807 543705 memory.go:191] Add success.
I0319 14:50:43.409808 543705 cpu.go:282] Add success.
I0319 14:50:43.420041 543705 net.go:648] Add success.
I0319 14:50:43.422643 543705 net.go:770] primary dev: ETH0
I0319 14:50:43.422656 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:50:43.422669 543705 net.go:698] Add success.
I0319 14:50:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:50:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:50:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:50:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:50:53.409806 543705 memory.go:184] no items to output this cycle
I0319 14:50:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 14:51:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:03.409809 543705 memory.go:184] no items to output this cycle
I0319 14:51:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 14:51:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:13.409806 543705 memory.go:191] Add success.
I0319 14:51:13.409807 543705 cpu.go:282] Add success.
W0319 14:51:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:51:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:51:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:51:13.420230 543705 net.go:648] Add success.
I0319 14:51:13.422826 543705 net.go:770] primary dev: ETH0
I0319 14:51:13.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:51:13.422855 543705 net.go:698] Add success.
I0319 14:51:13.569682 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0b1ceefb-e043-4755-bf79-0def63b161bf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:51:13.569716 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:51:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:51:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:51:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 14:51:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:51:14.456504 543705 disk_worker.go:494] system disk:vda1
I0319 14:51:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:51:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:51:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:51:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:51:16.458045 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:51:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:51:19.629672 543705 disk_info.go:125] begin check local disk info of client
I0319 14:51:19.632056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:51:19.632063 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6740 0xc0003b6780]
E0319 14:51:23.410402 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:23.410416 543705 memory.go:184] no items to output this cycle
I0319 14:51:23.410449 543705 cpu.go:275] no items to output this cycle
E0319 14:51:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:33.409765 543705 memory.go:184] no items to output this cycle
I0319 14:51:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 14:51:37.743415 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:51:37.743420 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:51:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:43.410662 543705 memory.go:191] Add success.
I0319 14:51:43.409828 543705 cpu.go:282] Add success.
I0319 14:51:43.420378 543705 net.go:648] Add success.
I0319 14:51:43.423102 543705 net.go:770] primary dev: ETH0
I0319 14:51:43.423114 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:51:43.423128 543705 net.go:698] Add success.
I0319 14:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:51:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:51:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:51:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:51:53.409803 543705 memory.go:184] no items to output this cycle
I0319 14:51:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 14:52:03.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:03.409895 543705 memory.go:184] no items to output this cycle
I0319 14:52:03.409964 543705 cpu.go:275] no items to output this cycle
E0319 14:52:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:13.409796 543705 memory.go:191] Add success.
I0319 14:52:13.409807 543705 cpu.go:282] Add success.
W0319 14:52:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:52:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:52:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:52:13.420052 543705 net.go:648] Add success.
I0319 14:52:13.422889 543705 net.go:770] primary dev: ETH0
I0319 14:52:13.422901 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:52:13.422913 543705 net.go:698] Add success.
W0319 14:52:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:52:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 14:52:14.455156 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:52:14.456937 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:52:14.456946 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:52:14.456953 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:52:14.457001 543705 disk_worker.go:494] system disk:vda1
I0319 14:52:14.457043 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:52:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:52:15.456852 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:52:16.457937 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:52:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:52:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:52:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:52:16.472343 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:52:19.632668 543705 disk_info.go:125] begin check local disk info of client
I0319 14:52:19.635111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:52:19.635118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003563c0 0xc000356400]
E0319 14:52:23.410392 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:23.410409 543705 memory.go:184] no items to output this cycle
I0319 14:52:23.410420 543705 cpu.go:275] no items to output this cycle
E0319 14:52:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:33.409773 543705 memory.go:184] no items to output this cycle
I0319 14:52:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 14:52:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:43.409829 543705 memory.go:191] Add success.
I0319 14:52:43.409838 543705 cpu.go:282] Add success.
I0319 14:52:43.419925 543705 net.go:648] Add success.
I0319 14:52:43.422568 543705 net.go:770] primary dev: ETH0
I0319 14:52:43.422583 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:52:43.422598 543705 net.go:698] Add success.
I0319 14:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:52:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:52:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:52:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:52:53.409785 543705 memory.go:184] no items to output this cycle
I0319 14:52:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:53:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:03.409790 543705 memory.go:184] no items to output this cycle
I0319 14:53:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 14:53:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:13.409840 543705 memory.go:191] Add success.
I0319 14:53:13.409847 543705 cpu.go:282] Add success.
W0319 14:53:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:53:13.409890 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:53:13.409894 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:53:13.420201 543705 net.go:648] Add success.
I0319 14:53:13.422846 543705 net.go:770] primary dev: ETH0
I0319 14:53:13.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:53:13.422871 543705 net.go:698] Add success.
I0319 14:53:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:53:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:53:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 14:53:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:53:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 14:53:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:53:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:53:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:53:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:53:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:53:19.635198 543705 disk_info.go:125] begin check local disk info of client
I0319 14:53:19.637635 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:53:19.637641 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357440 0xc000357480]
E0319 14:53:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:23.409771 543705 memory.go:184] no items to output this cycle
I0319 14:53:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 14:53:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:33.409788 543705 memory.go:184] no items to output this cycle
I0319 14:53:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 14:53:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:43.409811 543705 memory.go:191] Add success.
I0319 14:53:43.409813 543705 cpu.go:282] Add success.
I0319 14:53:43.420005 543705 net.go:648] Add success.
I0319 14:53:43.422796 543705 net.go:770] primary dev: ETH0
I0319 14:53:43.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:53:43.422827 543705 net.go:698] Add success.
I0319 14:53:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:53:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:53:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:53:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:53:53.409790 543705 cpu.go:275] no items to output this cycle
I0319 14:53:53.409792 543705 memory.go:184] no items to output this cycle
E0319 14:54:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:03.409770 543705 memory.go:184] no items to output this cycle
I0319 14:54:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:54:13.409985 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:13.410016 543705 memory.go:191] Add success.
W0319 14:54:13.410051 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:54:13.410073 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:54:13.410077 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:54:13.409984 543705 cpu.go:282] Add success.
I0319 14:54:13.419723 543705 net.go:648] Add success.
I0319 14:54:13.422413 543705 net.go:770] primary dev: ETH0
I0319 14:54:13.422425 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:54:13.422437 543705 net.go:698] Add success.
I0319 14:54:13.471571 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2732c29c-7950-49fa-aa21-c29b25140623","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:54:13.471613 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 14:54:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:54:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:54:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 14:54:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:54:14.456691 543705 disk_worker.go:494] system disk:vda1
I0319 14:54:14.456725 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:54:15.455612 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:54:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:54:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:54:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:54:19.637737 543705 disk_info.go:125] begin check local disk info of client
I0319 14:54:19.640198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:54:19.640205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aafc0 0xc0001ab000]
E0319 14:54:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:23.409763 543705 memory.go:184] no items to output this cycle
I0319 14:54:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 14:54:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:33.409800 543705 memory.go:184] no items to output this cycle
I0319 14:54:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 14:54:37.743566 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:54:37.743574 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:54:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:43.410801 543705 memory.go:191] Add success.
I0319 14:54:43.409806 543705 cpu.go:282] Add success.
I0319 14:54:43.420564 543705 net.go:648] Add success.
I0319 14:54:43.423362 543705 net.go:770] primary dev: ETH0
I0319 14:54:43.423376 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:54:43.423389 543705 net.go:698] Add success.
I0319 14:54:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:54:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:54:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:54:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:54:53.409777 543705 cpu.go:275] no items to output this cycle
I0319 14:54:53.409778 543705 memory.go:184] no items to output this cycle
E0319 14:55:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:03.409802 543705 memory.go:184] no items to output this cycle
I0319 14:55:03.409821 543705 cpu.go:275] no items to output this cycle
W0319 14:55:13.409720 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:55:13.409740 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:55:13.409747 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:55:13.409810 543705 cpu.go:282] Add success.
E0319 14:55:13.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:13.409865 543705 memory.go:191] Add success.
I0319 14:55:13.420070 543705 net.go:648] Add success.
I0319 14:55:13.423142 543705 net.go:770] primary dev: ETH0
I0319 14:55:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:55:13.423173 543705 net.go:698] Add success.
I0319 14:55:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:55:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:55:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 14:55:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:55:14.456569 543705 disk_worker.go:494] system disk:vda1
I0319 14:55:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:55:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:55:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:55:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:55:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:55:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:55:19.640905 543705 disk_info.go:125] begin check local disk info of client
I0319 14:55:19.643356 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:55:19.643364 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5f40 0xc000356000]
E0319 14:55:23.410263 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:23.410281 543705 memory.go:184] no items to output this cycle
I0319 14:55:23.410289 543705 cpu.go:275] no items to output this cycle
E0319 14:55:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:33.409791 543705 memory.go:184] no items to output this cycle
I0319 14:55:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 14:55:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:43.409789 543705 memory.go:191] Add success.
I0319 14:55:43.409808 543705 cpu.go:282] Add success.
I0319 14:55:43.420012 543705 net.go:648] Add success.
I0319 14:55:43.422874 543705 net.go:770] primary dev: ETH0
I0319 14:55:43.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:55:43.422899 543705 net.go:698] Add success.
I0319 14:55:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:55:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:55:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:55:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:55:53.409797 543705 memory.go:184] no items to output this cycle
I0319 14:55:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 14:56:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:03.409766 543705 memory.go:184] no items to output this cycle
I0319 14:56:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 14:56:13.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:13.409894 543705 memory.go:191] Add success.
W0319 14:56:13.409928 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:56:13.409944 543705 cpu.go:282] Add success.
W0319 14:56:13.409991 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:56:13.409996 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:56:13.419709 543705 net.go:648] Add success.
I0319 14:56:13.422724 543705 net.go:770] primary dev: ETH0
I0319 14:56:13.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:56:13.422751 543705 net.go:698] Add success.
I0319 14:56:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:56:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:56:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 14:56:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:56:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 14:56:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:56:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:56:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:56:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:56:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:56:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:56:19.643444 543705 disk_info.go:125] begin check local disk info of client
I0319 14:56:19.645901 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:56:19.645908 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004db300 0xc0004db340]
E0319 14:56:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:23.409760 543705 memory.go:184] no items to output this cycle
I0319 14:56:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:56:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:33.409794 543705 memory.go:184] no items to output this cycle
I0319 14:56:33.409807 543705 cpu.go:275] no items to output this cycle
E0319 14:56:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:43.409792 543705 memory.go:191] Add success.
I0319 14:56:43.409799 543705 cpu.go:282] Add success.
I0319 14:56:43.419895 543705 net.go:648] Add success.
I0319 14:56:43.422650 543705 net.go:770] primary dev: ETH0
I0319 14:56:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:56:43.422679 543705 net.go:698] Add success.
I0319 14:56:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:56:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:56:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:56:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:56:53.409765 543705 memory.go:184] no items to output this cycle
I0319 14:56:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 14:57:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:03.409798 543705 memory.go:184] no items to output this cycle
I0319 14:57:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:57:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:13.409826 543705 memory.go:191] Add success.
I0319 14:57:13.409833 543705 cpu.go:282] Add success.
W0319 14:57:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:57:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:57:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:57:13.420174 543705 net.go:648] Add success.
I0319 14:57:13.422969 543705 net.go:770] primary dev: ETH0
I0319 14:57:13.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:57:13.422996 543705 net.go:698] Add success.
I0319 14:57:13.429408 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 14:57:13.453580 543705 event_worker.go:152] Polling the log file for events...
I0319 14:57:13.517413 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5ceafe8b-2017-4be6-8209-0a9e91bfaa01","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 14:57:13.517446 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 14:57:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:57:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 14:57:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0319 14:57:14.455897 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 14:57:14.455906 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 14:57:14.455911 543705 custom_config.go:64] query custom config with name: gpu
I0319 14:57:14.456535 543705 disk_worker.go:494] system disk:vda1
I0319 14:57:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 14:57:15.456829 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 14:57:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:57:16.457914 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 14:57:16.457914 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 14:57:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:57:16.457992 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:57:16.472335 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:57:19.646869 543705 disk_info.go:125] begin check local disk info of client
I0319 14:57:19.649200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:57:19.649206 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329e40 0xc000329e80]
E0319 14:57:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:23.409798 543705 memory.go:184] no items to output this cycle
I0319 14:57:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 14:57:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:33.409784 543705 cpu.go:275] no items to output this cycle
I0319 14:57:33.409789 543705 memory.go:184] no items to output this cycle
I0319 14:57:37.744433 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 14:57:37.744439 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 14:57:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:43.410702 543705 memory.go:191] Add success.
I0319 14:57:43.409795 543705 cpu.go:282] Add success.
I0319 14:57:43.420393 543705 net.go:648] Add success.
I0319 14:57:43.422962 543705 net.go:770] primary dev: ETH0
I0319 14:57:43.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:57:43.422989 543705 net.go:698] Add success.
I0319 14:57:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:57:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:57:53.410348 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:57:53.410365 543705 memory.go:184] no items to output this cycle
I0319 14:57:53.410397 543705 cpu.go:275] no items to output this cycle
E0319 14:58:03.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:03.409862 543705 memory.go:184] no items to output this cycle
I0319 14:58:03.410014 543705 cpu.go:275] no items to output this cycle
E0319 14:58:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:13.409793 543705 memory.go:191] Add success.
W0319 14:58:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 14:58:13.409822 543705 cpu.go:282] Add success.
W0319 14:58:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:58:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:58:13.420146 543705 net.go:770] primary dev: ETH0
I0319 14:58:13.420160 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:58:13.420193 543705 net.go:698] Add success.
I0319 14:58:13.420543 543705 net.go:648] Add success.
I0319 14:58:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:58:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:58:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 14:58:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:58:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 14:58:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:58:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:58:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:58:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:58:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:58:19.649675 543705 disk_info.go:125] begin check local disk info of client
I0319 14:58:19.652018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:58:19.652024 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa240 0xc0001aa280]
E0319 14:58:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:23.409795 543705 memory.go:184] no items to output this cycle
I0319 14:58:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 14:58:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:33.409775 543705 memory.go:184] no items to output this cycle
I0319 14:58:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 14:58:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:43.409800 543705 memory.go:191] Add success.
I0319 14:58:43.409806 543705 cpu.go:282] Add success.
I0319 14:58:43.419883 543705 net.go:648] Add success.
I0319 14:58:43.422691 543705 net.go:770] primary dev: ETH0
I0319 14:58:43.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:58:43.422720 543705 net.go:698] Add success.
I0319 14:58:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:58:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:58:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:58:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:58:53.409782 543705 memory.go:184] no items to output this cycle
I0319 14:58:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 14:59:03.409902 543705 cpu.go:275] no items to output this cycle
E0319 14:59:03.409934 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:03.409969 543705 memory.go:184] no items to output this cycle
E0319 14:59:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:13.409799 543705 memory.go:191] Add success.
I0319 14:59:13.409810 543705 cpu.go:282] Add success.
W0319 14:59:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 14:59:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 14:59:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 14:59:13.420141 543705 net.go:648] Add success.
I0319 14:59:13.422759 543705 net.go:770] primary dev: ETH0
I0319 14:59:13.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:59:13.422784 543705 net.go:698] Add success.
I0319 14:59:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 14:59:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 14:59:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 14:59:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 14:59:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 14:59:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 14:59:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 14:59:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:59:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:59:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 14:59:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 14:59:19.652109 543705 disk_info.go:125] begin check local disk info of client
I0319 14:59:19.654534 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 14:59:19.654540 543705 disk_info.go:196] parse disk info done, disk is : [0xc000465680 0xc0004656c0]
E0319 14:59:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:23.409793 543705 memory.go:184] no items to output this cycle
I0319 14:59:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 14:59:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:33.409774 543705 memory.go:184] no items to output this cycle
I0319 14:59:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 14:59:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:43.409830 543705 memory.go:191] Add success.
I0319 14:59:43.409832 543705 cpu.go:282] Add success.
I0319 14:59:43.419982 543705 net.go:648] Add success.
I0319 14:59:43.422821 543705 net.go:770] primary dev: ETH0
I0319 14:59:43.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0319 14:59:43.422847 543705 net.go:698] Add success.
I0319 14:59:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 14:59:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 14:59:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 14:59:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 14:59:53.409764 543705 memory.go:184] no items to output this cycle
I0319 14:59:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:00:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:03.409903 543705 cpu.go:275] no items to output this cycle
I0319 15:00:03.409907 543705 memory.go:184] no items to output this cycle
E0319 15:00:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:13.409827 543705 memory.go:191] Add success.
I0319 15:00:13.409831 543705 cpu.go:282] Add success.
W0319 15:00:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:00:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:00:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:00:13.420210 543705 net.go:648] Add success.
I0319 15:00:13.423203 543705 net.go:770] primary dev: ETH0
I0319 15:00:13.423221 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:00:13.423234 543705 net.go:698] Add success.
I0319 15:00:13.547107 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dca45700-ffeb-4fff-ac84-efe033023136","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:00:13.547140 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:00:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:00:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:00:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 15:00:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:00:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 15:00:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:00:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:00:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:00:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:00:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:00:19.654910 543705 disk_info.go:125] begin check local disk info of client
I0319 15:00:19.657316 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:00:19.657321 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004adcc0 0xc0004add00]
E0319 15:00:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:23.409786 543705 memory.go:184] no items to output this cycle
I0319 15:00:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 15:00:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:33.409792 543705 memory.go:184] no items to output this cycle
I0319 15:00:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 15:00:37.744584 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:00:37.744592 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:00:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:43.410816 543705 memory.go:191] Add success.
I0319 15:00:43.409821 543705 cpu.go:282] Add success.
I0319 15:00:43.420521 543705 net.go:648] Add success.
I0319 15:00:43.423422 543705 net.go:770] primary dev: ETH0
I0319 15:00:43.423438 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:00:43.423453 543705 net.go:698] Add success.
I0319 15:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:00:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:00:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:00:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:00:53.409797 543705 memory.go:184] no items to output this cycle
I0319 15:00:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:01:03.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:03.409897 543705 memory.go:184] no items to output this cycle
I0319 15:01:03.409948 543705 cpu.go:275] no items to output this cycle
E0319 15:01:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:13.409830 543705 memory.go:191] Add success.
I0319 15:01:13.409843 543705 cpu.go:282] Add success.
W0319 15:01:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:01:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:01:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:01:13.420312 543705 net.go:648] Add success.
I0319 15:01:13.422926 543705 net.go:770] primary dev: ETH0
I0319 15:01:13.422941 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:01:13.422952 543705 net.go:698] Add success.
I0319 15:01:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:01:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:01:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 15:01:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:01:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 15:01:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:01:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:01:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:01:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:01:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:01:19.657672 543705 disk_info.go:125] begin check local disk info of client
I0319 15:01:19.660059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:01:19.660064 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0319 15:01:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:23.409797 543705 memory.go:184] no items to output this cycle
I0319 15:01:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:01:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:33.409790 543705 cpu.go:275] no items to output this cycle
I0319 15:01:33.409793 543705 memory.go:184] no items to output this cycle
E0319 15:01:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:43.409794 543705 memory.go:191] Add success.
I0319 15:01:43.409794 543705 cpu.go:282] Add success.
I0319 15:01:43.419859 543705 net.go:648] Add success.
I0319 15:01:43.422865 543705 net.go:770] primary dev: ETH0
I0319 15:01:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:01:43.422890 543705 net.go:698] Add success.
I0319 15:01:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:01:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:01:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:01:53.410239 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:01:53.410254 543705 memory.go:184] no items to output this cycle
I0319 15:01:53.410280 543705 cpu.go:275] no items to output this cycle
E0319 15:02:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:03.409797 543705 memory.go:184] no items to output this cycle
I0319 15:02:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 15:02:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:13.409828 543705 memory.go:191] Add success.
I0319 15:02:13.409835 543705 cpu.go:282] Add success.
W0319 15:02:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:02:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:02:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:02:13.420154 543705 net.go:648] Add success.
I0319 15:02:13.423093 543705 net.go:770] primary dev: ETH0
I0319 15:02:13.423106 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:02:13.423118 543705 net.go:698] Add success.
W0319 15:02:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:02:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 15:02:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:02:14.455951 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:02:14.455960 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:02:14.455966 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:02:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 15:02:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:02:15.456821 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:02:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:02:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:02:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:02:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:02:16.458014 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:02:16.472323 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:02:19.660943 543705 disk_info.go:125] begin check local disk info of client
I0319 15:02:19.663305 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:02:19.663311 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaf00 0xc0001aaf40]
E0319 15:02:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:23.409782 543705 memory.go:184] no items to output this cycle
I0319 15:02:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 15:02:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:33.409782 543705 memory.go:184] no items to output this cycle
I0319 15:02:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 15:02:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:43.409825 543705 memory.go:191] Add success.
I0319 15:02:43.409835 543705 cpu.go:282] Add success.
I0319 15:02:43.420019 543705 net.go:648] Add success.
I0319 15:02:43.422589 543705 net.go:770] primary dev: ETH0
I0319 15:02:43.422610 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:02:43.422622 543705 net.go:698] Add success.
I0319 15:02:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:02:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:02:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:02:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:02:53.409786 543705 cpu.go:275] no items to output this cycle
I0319 15:02:53.409793 543705 memory.go:184] no items to output this cycle
E0319 15:03:03.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:03.409901 543705 cpu.go:275] no items to output this cycle
I0319 15:03:03.409904 543705 memory.go:184] no items to output this cycle
E0319 15:03:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:13.409811 543705 memory.go:191] Add success.
I0319 15:03:13.409812 543705 cpu.go:282] Add success.
W0319 15:03:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:03:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:03:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:03:13.420156 543705 net.go:648] Add success.
I0319 15:03:13.423032 543705 net.go:770] primary dev: ETH0
I0319 15:03:13.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:03:13.423058 543705 net.go:698] Add success.
I0319 15:03:13.468733 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"29cba4b6-19cd-4c90-8c97-8b92ffc10a5d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:03:13.468767 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:03:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:03:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:03:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 15:03:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:03:14.456490 543705 disk_worker.go:494] system disk:vda1
I0319 15:03:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:03:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:03:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:03:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:03:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:03:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:03:19.663392 543705 disk_info.go:125] begin check local disk info of client
I0319 15:03:19.665818 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:03:19.665824 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007af40 0xc00007af80]
E0319 15:03:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:03:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 15:03:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:33.409783 543705 memory.go:184] no items to output this cycle
I0319 15:03:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 15:03:37.745427 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:03:37.745434 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:03:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:43.410611 543705 memory.go:191] Add success.
I0319 15:03:43.409817 543705 cpu.go:282] Add success.
I0319 15:03:43.420330 543705 net.go:648] Add success.
I0319 15:03:43.423053 543705 net.go:770] primary dev: ETH0
I0319 15:03:43.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:03:43.423078 543705 net.go:698] Add success.
I0319 15:03:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:03:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:03:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:03:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:03:53.409805 543705 memory.go:184] no items to output this cycle
I0319 15:03:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 15:04:03.409890 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:03.409907 543705 memory.go:184] no items to output this cycle
I0319 15:04:03.409954 543705 cpu.go:275] no items to output this cycle
E0319 15:04:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:13.409805 543705 memory.go:191] Add success.
I0319 15:04:13.409813 543705 cpu.go:282] Add success.
W0319 15:04:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:04:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:04:13.420172 543705 net.go:648] Add success.
I0319 15:04:13.422824 543705 net.go:770] primary dev: ETH0
I0319 15:04:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:04:13.422850 543705 net.go:698] Add success.
I0319 15:04:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:04:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:04:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 15:04:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:04:14.456588 543705 disk_worker.go:494] system disk:vda1
I0319 15:04:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:04:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:04:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:04:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:04:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:04:19.666966 543705 disk_info.go:125] begin check local disk info of client
I0319 15:04:19.669305 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:04:19.669310 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab300 0xc0001ab400]
E0319 15:04:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:23.409785 543705 memory.go:184] no items to output this cycle
I0319 15:04:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:04:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:33.409783 543705 memory.go:184] no items to output this cycle
I0319 15:04:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 15:04:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:43.409793 543705 memory.go:191] Add success.
I0319 15:04:43.409795 543705 cpu.go:282] Add success.
I0319 15:04:43.419975 543705 net.go:648] Add success.
I0319 15:04:43.422826 543705 net.go:770] primary dev: ETH0
I0319 15:04:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:04:43.422855 543705 net.go:698] Add success.
I0319 15:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:04:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:04:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:04:53.409792 543705 memory.go:184] no items to output this cycle
I0319 15:04:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:05:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:03.409806 543705 memory.go:184] no items to output this cycle
I0319 15:05:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:05:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:13.409825 543705 memory.go:191] Add success.
I0319 15:05:13.409827 543705 cpu.go:282] Add success.
W0319 15:05:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:05:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:05:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:05:13.420488 543705 net.go:648] Add success.
I0319 15:05:13.423130 543705 net.go:770] primary dev: ETH0
I0319 15:05:13.423145 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:05:13.423159 543705 net.go:698] Add success.
I0319 15:05:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:05:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:05:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 15:05:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:05:14.456574 543705 disk_worker.go:494] system disk:vda1
I0319 15:05:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:05:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:05:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:05:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:05:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:05:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:05:19.669672 543705 disk_info.go:125] begin check local disk info of client
I0319 15:05:19.672059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:05:19.672067 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abf40 0xc0000c4100]
E0319 15:05:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:23.409787 543705 memory.go:184] no items to output this cycle
I0319 15:05:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:05:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:33.409784 543705 memory.go:184] no items to output this cycle
I0319 15:05:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 15:05:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:43.409814 543705 memory.go:191] Add success.
I0319 15:05:43.409821 543705 cpu.go:282] Add success.
I0319 15:05:43.419945 543705 net.go:648] Add success.
I0319 15:05:43.422944 543705 net.go:770] primary dev: ETH0
I0319 15:05:43.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:05:43.422971 543705 net.go:698] Add success.
I0319 15:05:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:05:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:05:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:05:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:05:53.409772 543705 memory.go:184] no items to output this cycle
I0319 15:05:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 15:06:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:03.409805 543705 memory.go:184] no items to output this cycle
I0319 15:06:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:06:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:13.409797 543705 memory.go:191] Add success.
W0319 15:06:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:06:13.409827 543705 cpu.go:282] Add success.
W0319 15:06:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:06:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:06:13.420165 543705 net.go:648] Add success.
I0319 15:06:13.423184 543705 net.go:770] primary dev: ETH0
I0319 15:06:13.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:06:13.423212 543705 net.go:698] Add success.
I0319 15:06:13.468319 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"26d5ca81-87f5-41cc-8f68-7ac773a5f75d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:06:13.468353 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:06:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:06:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:06:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 15:06:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:06:14.456572 543705 disk_worker.go:494] system disk:vda1
I0319 15:06:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:06:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:06:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:06:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:06:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:06:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:06:19.673004 543705 disk_info.go:125] begin check local disk info of client
I0319 15:06:19.675556 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:06:19.675562 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abd40 0xc0001abd80]
E0319 15:06:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:23.409775 543705 memory.go:184] no items to output this cycle
I0319 15:06:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 15:06:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:33.409774 543705 memory.go:184] no items to output this cycle
I0319 15:06:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 15:06:37.745739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:06:37.745747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:06:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:43.410752 543705 memory.go:191] Add success.
I0319 15:06:43.409800 543705 cpu.go:282] Add success.
I0319 15:06:43.420438 543705 net.go:648] Add success.
I0319 15:06:43.423371 543705 net.go:770] primary dev: ETH0
I0319 15:06:43.423384 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:06:43.423397 543705 net.go:698] Add success.
I0319 15:06:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:06:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:06:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:06:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:06:53.409784 543705 cpu.go:275] no items to output this cycle
I0319 15:06:53.409793 543705 memory.go:184] no items to output this cycle
E0319 15:07:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:03.409783 543705 memory.go:184] no items to output this cycle
I0319 15:07:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 15:07:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:13.409786 543705 memory.go:191] Add success.
I0319 15:07:13.409795 543705 cpu.go:282] Add success.
W0319 15:07:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:07:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:07:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:07:13.420184 543705 net.go:648] Add success.
I0319 15:07:13.421180 543705 net.go:770] primary dev: ETH0
I0319 15:07:13.421194 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:07:13.421220 543705 net.go:698] Add success.
I0319 15:07:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0319 15:07:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:07:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 15:07:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:07:14.456805 543705 disk_worker.go:494] system disk:vda1
I0319 15:07:14.456846 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:07:14.457135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:07:14.457143 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:07:14.457148 543705 custom_config.go:64] query custom config with name: gpu
E0319 15:07:15.456875 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:07:15.456884 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:07:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:07:16.457969 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:07:16.458013 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:07:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:07:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:07:19.676018 543705 disk_info.go:125] begin check local disk info of client
I0319 15:07:19.678414 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:07:19.678420 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6300 0xc0003b6340]
E0319 15:07:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:23.409796 543705 memory.go:184] no items to output this cycle
I0319 15:07:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 15:07:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:33.409779 543705 memory.go:184] no items to output this cycle
I0319 15:07:33.409779 543705 cpu.go:275] no items to output this cycle
E0319 15:07:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:43.409794 543705 memory.go:191] Add success.
I0319 15:07:43.409797 543705 cpu.go:282] Add success.
I0319 15:07:43.419881 543705 net.go:648] Add success.
I0319 15:07:43.422536 543705 net.go:770] primary dev: ETH0
I0319 15:07:43.422550 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:07:43.422564 543705 net.go:698] Add success.
I0319 15:07:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:07:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:07:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:07:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:07:53.409793 543705 memory.go:184] no items to output this cycle
I0319 15:07:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:08:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:03.409769 543705 memory.go:184] no items to output this cycle
I0319 15:08:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 15:08:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:13.409794 543705 memory.go:191] Add success.
I0319 15:08:13.409812 543705 cpu.go:282] Add success.
W0319 15:08:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:08:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:08:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:08:13.420226 543705 net.go:648] Add success.
I0319 15:08:13.423226 543705 net.go:770] primary dev: ETH0
I0319 15:08:13.423241 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:08:13.423256 543705 net.go:698] Add success.
I0319 15:08:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:08:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:08:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 15:08:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:08:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 15:08:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:08:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:08:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:08:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:08:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:08:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:08:19.679089 543705 disk_info.go:125] begin check local disk info of client
I0319 15:08:19.681531 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:08:19.681537 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b9580 0xc0003b95c0]
E0319 15:08:23.410228 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:23.410249 543705 memory.go:184] no items to output this cycle
I0319 15:08:23.410264 543705 cpu.go:275] no items to output this cycle
E0319 15:08:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:33.409775 543705 memory.go:184] no items to output this cycle
I0319 15:08:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 15:08:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:43.409822 543705 memory.go:191] Add success.
I0319 15:08:43.409828 543705 cpu.go:282] Add success.
I0319 15:08:43.419885 543705 net.go:648] Add success.
I0319 15:08:43.422698 543705 net.go:770] primary dev: ETH0
I0319 15:08:43.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:08:43.422728 543705 net.go:698] Add success.
I0319 15:08:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:08:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:08:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:08:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:08:53.409793 543705 memory.go:184] no items to output this cycle
I0319 15:08:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:09:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:03.409770 543705 memory.go:184] no items to output this cycle
I0319 15:09:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:09:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:13.409827 543705 memory.go:191] Add success.
I0319 15:09:13.409834 543705 cpu.go:282] Add success.
W0319 15:09:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:09:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:09:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:09:13.420130 543705 net.go:648] Add success.
I0319 15:09:13.422876 543705 net.go:770] primary dev: ETH0
I0319 15:09:13.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:09:13.422901 543705 net.go:698] Add success.
I0319 15:09:13.469415 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"eccc735f-e79b-45e3-b1c8-f1e47ea598f7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:09:13.469459 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:09:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:09:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:09:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 15:09:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:09:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 15:09:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:09:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:09:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:09:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:09:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:09:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:09:19.681671 543705 disk_info.go:125] begin check local disk info of client
I0319 15:09:19.684069 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:09:19.684075 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047bac0 0xc00047bb00]
E0319 15:09:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:09:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 15:09:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:33.409784 543705 memory.go:184] no items to output this cycle
I0319 15:09:33.409785 543705 cpu.go:275] no items to output this cycle
I0319 15:09:37.747451 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:09:37.747458 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:09:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:43.410743 543705 memory.go:191] Add success.
I0319 15:09:43.409811 543705 cpu.go:282] Add success.
I0319 15:09:43.420518 543705 net.go:648] Add success.
I0319 15:09:43.423411 543705 net.go:770] primary dev: ETH0
I0319 15:09:43.423427 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:09:43.423455 543705 net.go:698] Add success.
I0319 15:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:09:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:09:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:09:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:09:53.409771 543705 memory.go:184] no items to output this cycle
I0319 15:09:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 15:10:03.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:03.409885 543705 memory.go:184] no items to output this cycle
I0319 15:10:03.409959 543705 cpu.go:275] no items to output this cycle
E0319 15:10:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:13.409803 543705 memory.go:191] Add success.
I0319 15:10:13.409804 543705 cpu.go:282] Add success.
W0319 15:10:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:10:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:10:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:10:13.420182 543705 net.go:648] Add success.
I0319 15:10:13.423026 543705 net.go:770] primary dev: ETH0
I0319 15:10:13.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:10:13.423054 543705 net.go:698] Add success.
I0319 15:10:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:10:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:10:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 15:10:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:10:14.456582 543705 disk_worker.go:494] system disk:vda1
I0319 15:10:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:10:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:10:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:10:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:10:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:10:19.685125 543705 disk_info.go:125] begin check local disk info of client
I0319 15:10:19.687529 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:10:19.687534 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003288c0 0xc000328900]
E0319 15:10:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:23.409781 543705 memory.go:184] no items to output this cycle
I0319 15:10:23.409780 543705 cpu.go:275] no items to output this cycle
E0319 15:10:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:33.409774 543705 memory.go:184] no items to output this cycle
I0319 15:10:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:10:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:43.409824 543705 memory.go:191] Add success.
I0319 15:10:43.409833 543705 cpu.go:282] Add success.
I0319 15:10:43.420251 543705 net.go:648] Add success.
I0319 15:10:43.423092 543705 net.go:770] primary dev: ETH0
I0319 15:10:43.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:10:43.423123 543705 net.go:698] Add success.
I0319 15:10:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:10:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:10:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:10:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:10:53.409768 543705 memory.go:184] no items to output this cycle
I0319 15:10:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:11:03.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:03.409927 543705 memory.go:184] no items to output this cycle
I0319 15:11:03.409978 543705 cpu.go:275] no items to output this cycle
E0319 15:11:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:13.409785 543705 memory.go:191] Add success.
W0319 15:11:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:11:13.409816 543705 cpu.go:282] Add success.
W0319 15:11:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:11:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:11:13.420192 543705 net.go:648] Add success.
I0319 15:11:13.423181 543705 net.go:770] primary dev: ETH0
I0319 15:11:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:11:13.423210 543705 net.go:698] Add success.
I0319 15:11:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:11:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:11:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 15:11:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:11:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 15:11:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:11:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:11:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:11:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:11:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:11:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:11:19.687628 543705 disk_info.go:125] begin check local disk info of client
I0319 15:11:19.689908 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:11:19.689914 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047ddc0 0xc00047de00]
E0319 15:11:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:23.409781 543705 cpu.go:275] no items to output this cycle
I0319 15:11:23.409782 543705 memory.go:184] no items to output this cycle
E0319 15:11:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:33.409764 543705 memory.go:184] no items to output this cycle
I0319 15:11:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 15:11:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:43.409800 543705 memory.go:191] Add success.
I0319 15:11:43.409811 543705 cpu.go:282] Add success.
I0319 15:11:43.419895 543705 net.go:648] Add success.
I0319 15:11:43.422663 543705 net.go:770] primary dev: ETH0
I0319 15:11:43.422682 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:11:43.422702 543705 net.go:698] Add success.
I0319 15:11:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:11:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:11:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:11:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:11:53.409764 543705 memory.go:184] no items to output this cycle
I0319 15:11:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:12:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:03.409798 543705 memory.go:184] no items to output this cycle
I0319 15:12:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 15:12:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:13.409812 543705 memory.go:191] Add success.
I0319 15:12:13.409814 543705 cpu.go:282] Add success.
W0319 15:12:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:12:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:12:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:12:13.420178 543705 net.go:648] Add success.
I0319 15:12:13.423064 543705 net.go:770] primary dev: ETH0
I0319 15:12:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:12:13.423089 543705 net.go:698] Add success.
I0319 15:12:13.464480 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"185cc2e3-2941-4e8c-8578-97797f7de880","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:12:13.464514 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 15:12:14.455230 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:12:14.455248 543705 disk_worker.go:708] disk space is not compliant
W0319 15:12:14.455253 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:12:14.455927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:12:14.455937 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:12:14.455942 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:12:14.456850 543705 disk_worker.go:494] system disk:vda1
I0319 15:12:14.456907 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:12:15.456903 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:12:15.456913 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:12:16.457940 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:12:16.457949 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:12:16.457995 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:12:16.458010 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:12:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:12:19.689999 543705 disk_info.go:125] begin check local disk info of client
I0319 15:12:19.692409 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:12:19.692416 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab880 0xc0001ab8c0]
E0319 15:12:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:23.409800 543705 memory.go:184] no items to output this cycle
I0319 15:12:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 15:12:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:33.409781 543705 memory.go:184] no items to output this cycle
I0319 15:12:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 15:12:37.747612 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:12:37.747620 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:12:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:43.410779 543705 memory.go:191] Add success.
I0319 15:12:43.409840 543705 cpu.go:282] Add success.
I0319 15:12:43.420485 543705 net.go:648] Add success.
I0319 15:12:43.423382 543705 net.go:770] primary dev: ETH0
I0319 15:12:43.423399 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:12:43.423415 543705 net.go:698] Add success.
I0319 15:12:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:12:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:12:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:12:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:12:53.409770 543705 memory.go:184] no items to output this cycle
I0319 15:12:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:13:03.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:03.409893 543705 memory.go:184] no items to output this cycle
I0319 15:13:03.410010 543705 cpu.go:275] no items to output this cycle
E0319 15:13:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:13.409805 543705 memory.go:191] Add success.
I0319 15:13:13.409815 543705 cpu.go:282] Add success.
W0319 15:13:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:13:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:13:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:13:13.420206 543705 net.go:648] Add success.
I0319 15:13:13.422878 543705 net.go:770] primary dev: ETH0
I0319 15:13:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:13:13.422903 543705 net.go:698] Add success.
I0319 15:13:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:13:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:13:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 15:13:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:13:14.456481 543705 disk_worker.go:494] system disk:vda1
I0319 15:13:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:13:16.458007 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:13:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:13:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:13:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:13:19.692505 543705 disk_info.go:125] begin check local disk info of client
I0319 15:13:19.695020 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:13:19.695026 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357240 0xc000357280]
E0319 15:13:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:23.409805 543705 memory.go:184] no items to output this cycle
I0319 15:13:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 15:13:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:33.409809 543705 memory.go:184] no items to output this cycle
I0319 15:13:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 15:13:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:43.409790 543705 memory.go:191] Add success.
I0319 15:13:43.409798 543705 cpu.go:282] Add success.
I0319 15:13:43.419866 543705 net.go:648] Add success.
I0319 15:13:43.422525 543705 net.go:770] primary dev: ETH0
I0319 15:13:43.422538 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:13:43.422559 543705 net.go:698] Add success.
I0319 15:13:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:13:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:13:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:13:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:13:53.409767 543705 memory.go:184] no items to output this cycle
I0319 15:13:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 15:14:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:03.409767 543705 memory.go:184] no items to output this cycle
I0319 15:14:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 15:14:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:13.409809 543705 memory.go:191] Add success.
I0319 15:14:13.409824 543705 cpu.go:282] Add success.
W0319 15:14:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:14:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:14:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:14:13.420163 543705 net.go:648] Add success.
I0319 15:14:13.422932 543705 net.go:770] primary dev: ETH0
I0319 15:14:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:14:13.422957 543705 net.go:698] Add success.
I0319 15:14:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:14:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:14:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 15:14:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:14:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 15:14:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:14:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:14:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:14:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:14:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:14:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:14:19.695115 543705 disk_info.go:125] begin check local disk info of client
I0319 15:14:19.697574 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:14:19.697581 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004adf00 0xc0004adf40]
E0319 15:14:23.410418 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:23.410434 543705 memory.go:184] no items to output this cycle
I0319 15:14:23.410436 543705 cpu.go:275] no items to output this cycle
E0319 15:14:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:33.409798 543705 memory.go:184] no items to output this cycle
I0319 15:14:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:14:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:43.409795 543705 memory.go:191] Add success.
I0319 15:14:43.409800 543705 cpu.go:282] Add success.
I0319 15:14:43.419967 543705 net.go:648] Add success.
I0319 15:14:43.422786 543705 net.go:770] primary dev: ETH0
I0319 15:14:43.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:14:43.422822 543705 net.go:698] Add success.
I0319 15:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:14:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:14:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:14:53.410392 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:14:53.410411 543705 memory.go:184] no items to output this cycle
I0319 15:14:53.410421 543705 cpu.go:275] no items to output this cycle
E0319 15:15:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:03.409766 543705 memory.go:184] no items to output this cycle
I0319 15:15:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 15:15:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:13.409920 543705 memory.go:191] Add success.
W0319 15:15:13.409967 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:15:13.410026 543705 cpu.go:282] Add success.
W0319 15:15:13.410097 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:15:13.410101 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:15:13.419757 543705 net.go:648] Add success.
I0319 15:15:13.422669 543705 net.go:770] primary dev: ETH0
I0319 15:15:13.422684 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:15:13.422696 543705 net.go:698] Add success.
I0319 15:15:13.468629 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d6903129-a4de-4a3f-bac2-93fd9fc3d319","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:15:13.468662 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:15:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:15:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:15:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 15:15:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:15:14.456671 543705 disk_worker.go:494] system disk:vda1
I0319 15:15:14.456700 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:15:15.455611 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:15:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:15:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:15:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:15:16.472450 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:15:19.697671 543705 disk_info.go:125] begin check local disk info of client
I0319 15:15:19.700138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:15:19.700144 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328440 0xc000328480]
E0319 15:15:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:15:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 15:15:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:33.409800 543705 memory.go:184] no items to output this cycle
I0319 15:15:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 15:15:37.748431 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:15:37.748438 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:15:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:43.410871 543705 memory.go:191] Add success.
I0319 15:15:43.409861 543705 cpu.go:282] Add success.
I0319 15:15:43.420678 543705 net.go:648] Add success.
I0319 15:15:43.423622 543705 net.go:770] primary dev: ETH0
I0319 15:15:43.423637 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:15:43.423651 543705 net.go:698] Add success.
I0319 15:15:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:15:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:15:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:15:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:15:53.409797 543705 memory.go:184] no items to output this cycle
I0319 15:15:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 15:16:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:03.409772 543705 memory.go:184] no items to output this cycle
I0319 15:16:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:16:13.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:13.409897 543705 cpu.go:282] Add success.
I0319 15:16:13.409903 543705 memory.go:191] Add success.
W0319 15:16:13.409939 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:16:13.409957 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:16:13.409967 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:16:13.419713 543705 net.go:648] Add success.
I0319 15:16:13.422627 543705 net.go:770] primary dev: ETH0
I0319 15:16:13.422642 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:16:13.422656 543705 net.go:698] Add success.
I0319 15:16:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:16:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:16:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 15:16:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:16:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 15:16:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:16:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:16:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:16:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:16:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:16:16.472451 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:16:19.701145 543705 disk_info.go:125] begin check local disk info of client
I0319 15:16:19.703697 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:16:19.703704 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ac380 0xc0004ac3c0]
E0319 15:16:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:16:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:16:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:33.409795 543705 memory.go:184] no items to output this cycle
I0319 15:16:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 15:16:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:43.409789 543705 memory.go:191] Add success.
I0319 15:16:43.409817 543705 cpu.go:282] Add success.
I0319 15:16:43.419881 543705 net.go:648] Add success.
I0319 15:16:43.422825 543705 net.go:770] primary dev: ETH0
I0319 15:16:43.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:16:43.422851 543705 net.go:698] Add success.
I0319 15:16:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:16:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:16:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:16:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:16:53.409775 543705 memory.go:184] no items to output this cycle
I0319 15:16:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 15:17:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:03.409796 543705 memory.go:184] no items to output this cycle
I0319 15:17:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 15:17:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:13.409800 543705 memory.go:191] Add success.
I0319 15:17:13.409805 543705 cpu.go:282] Add success.
W0319 15:17:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:17:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:17:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:17:13.420164 543705 net.go:648] Add success.
I0319 15:17:13.423181 543705 net.go:770] primary dev: ETH0
I0319 15:17:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:17:13.423210 543705 net.go:698] Add success.
I0319 15:17:13.452788 543705 event_worker.go:152] Polling the log file for events...
W0319 15:17:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:17:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 15:17:14.455201 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:17:14.457012 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:17:14.457022 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:17:14.457029 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:17:14.457049 543705 disk_worker.go:494] system disk:vda1
I0319 15:17:14.457092 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:17:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:17:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:17:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:17:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:17:16.458002 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:17:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:17:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:17:19.705164 543705 disk_info.go:125] begin check local disk info of client
I0319 15:17:19.707539 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:17:19.707545 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ca800 0xc0004ca840]
E0319 15:17:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:23.409769 543705 memory.go:184] no items to output this cycle
I0319 15:17:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 15:17:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:33.409799 543705 memory.go:184] no items to output this cycle
I0319 15:17:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:17:43.409816 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:43.409848 543705 memory.go:191] Add success.
I0319 15:17:43.409932 543705 cpu.go:282] Add success.
I0319 15:17:43.420238 543705 net.go:648] Add success.
I0319 15:17:43.423325 543705 net.go:770] primary dev: ETH0
I0319 15:17:43.423339 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:17:43.423353 543705 net.go:698] Add success.
I0319 15:17:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:17:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:17:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:17:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:17:53.409769 543705 memory.go:184] no items to output this cycle
I0319 15:17:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:18:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:03.409767 543705 memory.go:184] no items to output this cycle
I0319 15:18:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:18:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:13.409804 543705 memory.go:191] Add success.
I0319 15:18:13.409806 543705 cpu.go:282] Add success.
W0319 15:18:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:18:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:18:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:18:13.420276 543705 net.go:648] Add success.
I0319 15:18:13.422953 543705 net.go:770] primary dev: ETH0
I0319 15:18:13.422968 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:18:13.422981 543705 net.go:698] Add success.
I0319 15:18:13.470138 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"348f6d04-d646-4476-8b3d-d26557d6f826","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:18:13.470172 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:18:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:18:14.455335 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:18:14.455346 543705 disk_worker.go:708] disk space is not compliant
W0319 15:18:14.455349 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:18:14.456777 543705 disk_worker.go:494] system disk:vda1
I0319 15:18:14.456820 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:18:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:18:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:18:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:18:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:18:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:18:19.707626 543705 disk_info.go:125] begin check local disk info of client
I0319 15:18:19.710085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:18:19.710091 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029a4c0 0xc00029a500]
E0319 15:18:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:23.409765 543705 memory.go:184] no items to output this cycle
I0319 15:18:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 15:18:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:33.409797 543705 memory.go:184] no items to output this cycle
I0319 15:18:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 15:18:37.749440 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:18:37.749448 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:18:43.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:43.410603 543705 memory.go:191] Add success.
I0319 15:18:43.409839 543705 cpu.go:282] Add success.
I0319 15:18:43.420384 543705 net.go:648] Add success.
I0319 15:18:43.423246 543705 net.go:770] primary dev: ETH0
I0319 15:18:43.423262 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:18:43.423276 543705 net.go:698] Add success.
I0319 15:18:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:18:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:18:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:18:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:18:53.409796 543705 memory.go:184] no items to output this cycle
I0319 15:18:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 15:19:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:03.409781 543705 memory.go:184] no items to output this cycle
I0319 15:19:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 15:19:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:13.409805 543705 memory.go:191] Add success.
I0319 15:19:13.409809 543705 cpu.go:282] Add success.
W0319 15:19:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:19:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:19:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:19:13.420189 543705 net.go:648] Add success.
I0319 15:19:13.423219 543705 net.go:770] primary dev: ETH0
I0319 15:19:13.423232 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:19:13.423244 543705 net.go:698] Add success.
I0319 15:19:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:19:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:19:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 15:19:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:19:14.457232 543705 disk_worker.go:494] system disk:vda1
I0319 15:19:14.457260 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:19:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:19:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:19:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:19:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:19:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:19:19.711193 543705 disk_info.go:125] begin check local disk info of client
I0319 15:19:19.713617 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:19:19.713623 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a2180 0xc0004a21c0]
E0319 15:19:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:23.409769 543705 memory.go:184] no items to output this cycle
I0319 15:19:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 15:19:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:33.409797 543705 memory.go:184] no items to output this cycle
I0319 15:19:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 15:19:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:43.409780 543705 memory.go:191] Add success.
I0319 15:19:43.409802 543705 cpu.go:282] Add success.
I0319 15:19:43.419978 543705 net.go:648] Add success.
I0319 15:19:43.422934 543705 net.go:770] primary dev: ETH0
I0319 15:19:43.422947 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:19:43.422960 543705 net.go:698] Add success.
I0319 15:19:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:19:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:19:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:19:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:19:53.409798 543705 memory.go:184] no items to output this cycle
I0319 15:19:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 15:20:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:03.409795 543705 memory.go:184] no items to output this cycle
I0319 15:20:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 15:20:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:13.409796 543705 memory.go:191] Add success.
I0319 15:20:13.409814 543705 cpu.go:282] Add success.
W0319 15:20:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:20:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:20:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:20:13.420109 543705 net.go:648] Add success.
I0319 15:20:13.423317 543705 net.go:770] primary dev: ETH0
I0319 15:20:13.423330 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:20:13.423341 543705 net.go:698] Add success.
I0319 15:20:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:20:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:20:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 15:20:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:20:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 15:20:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:20:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:20:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:20:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:20:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:20:19.713684 543705 disk_info.go:125] begin check local disk info of client
I0319 15:20:19.716162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:20:19.716168 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e0c0 0xc00037e100]
E0319 15:20:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:23.409768 543705 memory.go:184] no items to output this cycle
I0319 15:20:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 15:20:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:33.409766 543705 memory.go:184] no items to output this cycle
I0319 15:20:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 15:20:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:43.409828 543705 memory.go:191] Add success.
I0319 15:20:43.409831 543705 cpu.go:282] Add success.
I0319 15:20:43.420039 543705 net.go:648] Add success.
I0319 15:20:43.423107 543705 net.go:770] primary dev: ETH0
I0319 15:20:43.423122 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:20:43.423136 543705 net.go:698] Add success.
I0319 15:20:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:20:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:20:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:20:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:20:53.409778 543705 memory.go:184] no items to output this cycle
I0319 15:20:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 15:21:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:03.409802 543705 memory.go:184] no items to output this cycle
I0319 15:21:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 15:21:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:13.409822 543705 memory.go:191] Add success.
I0319 15:21:13.409833 543705 cpu.go:282] Add success.
W0319 15:21:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:21:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:21:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:21:13.420123 543705 net.go:648] Add success.
I0319 15:21:13.422866 543705 net.go:770] primary dev: ETH0
I0319 15:21:13.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:21:13.422891 543705 net.go:698] Add success.
I0319 15:21:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:21:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:21:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 15:21:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:21:14.456588 543705 disk_worker.go:494] system disk:vda1
I0319 15:21:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:21:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:21:16.188164 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5b7f72eb-5269-4ac3-bce3-72715cfbcfcb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:21:16.188202 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:21:16.457639 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:21:16.457707 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:21:16.457735 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:21:16.473046 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:21:19.717282 543705 disk_info.go:125] begin check local disk info of client
I0319 15:21:19.719712 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:21:19.719719 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a400 0xc00039a440]
E0319 15:21:23.410501 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:23.410520 543705 memory.go:184] no items to output this cycle
I0319 15:21:23.410530 543705 cpu.go:275] no items to output this cycle
E0319 15:21:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:33.409777 543705 memory.go:184] no items to output this cycle
I0319 15:21:33.409782 543705 cpu.go:275] no items to output this cycle
I0319 15:21:37.749731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:21:37.749738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:21:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:43.410639 543705 memory.go:191] Add success.
I0319 15:21:43.409828 543705 cpu.go:282] Add success.
I0319 15:21:43.420379 543705 net.go:648] Add success.
I0319 15:21:43.423048 543705 net.go:770] primary dev: ETH0
I0319 15:21:43.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:21:43.423074 543705 net.go:698] Add success.
I0319 15:21:46.458496 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:21:46.458569 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:21:46.458603 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:21:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:21:53.409773 543705 memory.go:184] no items to output this cycle
I0319 15:21:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:22:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:03.409772 543705 memory.go:184] no items to output this cycle
I0319 15:22:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:22:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:13.409794 543705 memory.go:191] Add success.
I0319 15:22:13.409811 543705 cpu.go:282] Add success.
W0319 15:22:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:22:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:22:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:22:13.420282 543705 net.go:648] Add success.
I0319 15:22:13.422983 543705 net.go:770] primary dev: ETH0
I0319 15:22:13.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:22:13.423009 543705 net.go:698] Add success.
W0319 15:22:14.455232 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:22:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0319 15:22:14.455250 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:22:14.455901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:22:14.455910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:22:14.455916 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:22:14.456846 543705 disk_worker.go:494] system disk:vda1
I0319 15:22:14.456891 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:22:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:22:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:22:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:22:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:22:16.457965 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:22:16.457984 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:22:16.472298 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:22:19.721235 543705 disk_info.go:125] begin check local disk info of client
I0319 15:22:19.723686 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:22:19.723691 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bed80 0xc0003bedc0]
I0319 15:22:23.409942 543705 cpu.go:275] no items to output this cycle
E0319 15:22:23.409949 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:23.410025 543705 memory.go:184] no items to output this cycle
E0319 15:22:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:33.409792 543705 memory.go:184] no items to output this cycle
I0319 15:22:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:22:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:43.409801 543705 memory.go:191] Add success.
I0319 15:22:43.409808 543705 cpu.go:282] Add success.
I0319 15:22:43.420013 543705 net.go:648] Add success.
I0319 15:22:43.422648 543705 net.go:770] primary dev: ETH0
I0319 15:22:43.422661 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:22:43.422688 543705 net.go:698] Add success.
I0319 15:22:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:22:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:22:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:22:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:22:53.409788 543705 memory.go:184] no items to output this cycle
I0319 15:22:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 15:23:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:03.409779 543705 memory.go:184] no items to output this cycle
I0319 15:23:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:23:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:13.409803 543705 cpu.go:282] Add success.
I0319 15:23:13.409807 543705 memory.go:191] Add success.
W0319 15:23:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:23:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:23:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:23:13.420164 543705 net.go:648] Add success.
I0319 15:23:13.423171 543705 net.go:770] primary dev: ETH0
I0319 15:23:13.423184 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:23:13.423197 543705 net.go:698] Add success.
I0319 15:23:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:23:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:23:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 15:23:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:23:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 15:23:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:23:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:23:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:23:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:23:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:23:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:23:19.725262 543705 disk_info.go:125] begin check local disk info of client
I0319 15:23:19.727736 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:23:19.727742 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348d80 0xc000348dc0]
E0319 15:23:23.410319 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:23.410328 543705 cpu.go:275] no items to output this cycle
I0319 15:23:23.410334 543705 memory.go:184] no items to output this cycle
E0319 15:23:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:33.409806 543705 memory.go:184] no items to output this cycle
I0319 15:23:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 15:23:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:43.409804 543705 memory.go:191] Add success.
I0319 15:23:43.409806 543705 cpu.go:282] Add success.
I0319 15:23:43.419949 543705 net.go:648] Add success.
I0319 15:23:43.422542 543705 net.go:770] primary dev: ETH0
I0319 15:23:43.422557 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:23:43.422571 543705 net.go:698] Add success.
I0319 15:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:23:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:23:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:23:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:23:53.409804 543705 memory.go:184] no items to output this cycle
I0319 15:23:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:24:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:03.409774 543705 memory.go:184] no items to output this cycle
I0319 15:24:03.409779 543705 cpu.go:275] no items to output this cycle
E0319 15:24:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:13.409801 543705 memory.go:191] Add success.
I0319 15:24:13.409802 543705 cpu.go:282] Add success.
W0319 15:24:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:24:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:24:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:24:13.420347 543705 net.go:648] Add success.
I0319 15:24:13.423260 543705 net.go:770] primary dev: ETH0
I0319 15:24:13.423275 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:24:13.423290 543705 net.go:698] Add success.
I0319 15:24:13.463334 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1089aa0b-4eab-44ba-8870-2de1300fad75","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:24:13.463369 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:24:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:24:14.455213 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:24:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0319 15:24:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:24:14.456803 543705 disk_worker.go:494] system disk:vda1
I0319 15:24:14.456850 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:24:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:24:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:24:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:24:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:24:16.472109 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:24:19.729290 543705 disk_info.go:125] begin check local disk info of client
I0319 15:24:19.731779 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:24:19.731785 543705 disk_info.go:196] parse disk info done, disk is : [0xc000326000 0xc000326040]
E0319 15:24:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:23.409781 543705 memory.go:184] no items to output this cycle
I0319 15:24:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 15:24:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:33.409774 543705 memory.go:184] no items to output this cycle
I0319 15:24:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 15:24:37.749879 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:24:37.749885 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:24:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:43.410796 543705 memory.go:191] Add success.
I0319 15:24:43.409797 543705 cpu.go:282] Add success.
I0319 15:24:43.420533 543705 net.go:648] Add success.
I0319 15:24:43.423550 543705 net.go:770] primary dev: ETH0
I0319 15:24:43.423565 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:24:43.423580 543705 net.go:698] Add success.
I0319 15:24:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:24:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:24:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:24:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:24:53.409779 543705 memory.go:184] no items to output this cycle
I0319 15:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 15:25:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:03.409773 543705 memory.go:184] no items to output this cycle
I0319 15:25:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 15:25:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:13.409791 543705 memory.go:191] Add success.
I0319 15:25:13.409802 543705 cpu.go:282] Add success.
W0319 15:25:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:25:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:25:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:25:13.420059 543705 net.go:648] Add success.
I0319 15:25:13.422813 543705 net.go:770] primary dev: ETH0
I0319 15:25:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:25:13.422838 543705 net.go:698] Add success.
I0319 15:25:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:25:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:25:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 15:25:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:25:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 15:25:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:25:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:25:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:25:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:25:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:25:16.472090 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:25:19.733300 543705 disk_info.go:125] begin check local disk info of client
I0319 15:25:19.735737 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:25:19.735744 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314740 0xc000314780]
E0319 15:25:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:23.409775 543705 memory.go:184] no items to output this cycle
I0319 15:25:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 15:25:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:33.409802 543705 memory.go:184] no items to output this cycle
I0319 15:25:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 15:25:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:43.409780 543705 memory.go:191] Add success.
I0319 15:25:43.409801 543705 cpu.go:282] Add success.
I0319 15:25:43.420015 543705 net.go:648] Add success.
I0319 15:25:43.423108 543705 net.go:770] primary dev: ETH0
I0319 15:25:43.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:25:43.423138 543705 net.go:698] Add success.
I0319 15:25:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:25:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:25:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:25:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:25:53.409784 543705 memory.go:184] no items to output this cycle
I0319 15:25:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 15:26:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:03.409772 543705 memory.go:184] no items to output this cycle
I0319 15:26:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:26:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:13.409822 543705 memory.go:191] Add success.
I0319 15:26:13.409827 543705 cpu.go:282] Add success.
W0319 15:26:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:26:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:26:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:26:13.420529 543705 net.go:648] Add success.
I0319 15:26:13.423283 543705 net.go:770] primary dev: ETH0
I0319 15:26:13.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:26:13.423307 543705 net.go:698] Add success.
I0319 15:26:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:26:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:26:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 15:26:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:26:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 15:26:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:26:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:26:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:26:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:26:16.472090 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:26:19.735828 543705 disk_info.go:125] begin check local disk info of client
I0319 15:26:19.738289 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:26:19.738296 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2080 0xc0003b20c0]
E0319 15:26:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:23.409776 543705 memory.go:184] no items to output this cycle
I0319 15:26:23.409780 543705 cpu.go:275] no items to output this cycle
E0319 15:26:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:33.409800 543705 memory.go:184] no items to output this cycle
I0319 15:26:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 15:26:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:43.409788 543705 memory.go:191] Add success.
I0319 15:26:43.409811 543705 cpu.go:282] Add success.
I0319 15:26:43.419966 543705 net.go:648] Add success.
I0319 15:26:43.422626 543705 net.go:770] primary dev: ETH0
I0319 15:26:43.422639 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:26:43.422653 543705 net.go:698] Add success.
I0319 15:26:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:26:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:26:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:26:53.410208 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:26:53.410217 543705 cpu.go:275] no items to output this cycle
I0319 15:26:53.410222 543705 memory.go:184] no items to output this cycle
E0319 15:27:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:03.409797 543705 memory.go:184] no items to output this cycle
I0319 15:27:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:27:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:13.409783 543705 memory.go:191] Add success.
I0319 15:27:13.409805 543705 cpu.go:282] Add success.
W0319 15:27:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:27:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:27:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:27:13.420129 543705 net.go:648] Add success.
I0319 15:27:13.429061 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 15:27:13.429148 543705 net.go:770] primary dev: ETH0
I0319 15:27:13.429160 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:27:13.429171 543705 net.go:698] Add success.
I0319 15:27:13.453662 543705 event_worker.go:152] Polling the log file for events...
I0319 15:27:13.470423 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31e06cdf-7a7b-43a8-95c1-0734914a3ffb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:27:13.470459 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 15:27:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:27:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 15:27:14.455205 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:27:14.455901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:27:14.455910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:27:14.455915 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:27:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 15:27:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:27:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:27:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:27:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:27:16.457988 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:27:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:27:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:27:16.472439 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:27:19.739324 543705 disk_info.go:125] begin check local disk info of client
I0319 15:27:19.741851 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:27:19.741858 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c0c0 0xc00039c100]
E0319 15:27:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:23.409759 543705 memory.go:184] no items to output this cycle
I0319 15:27:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 15:27:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:33.409768 543705 memory.go:184] no items to output this cycle
I0319 15:27:33.409792 543705 cpu.go:275] no items to output this cycle
I0319 15:27:37.750027 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:27:37.750034 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:27:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:43.410674 543705 memory.go:191] Add success.
I0319 15:27:43.409820 543705 cpu.go:282] Add success.
I0319 15:27:43.420431 543705 net.go:648] Add success.
I0319 15:27:43.423422 543705 net.go:770] primary dev: ETH0
I0319 15:27:43.423435 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:27:43.423448 543705 net.go:698] Add success.
I0319 15:27:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:27:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:27:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:27:53.410409 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:27:53.410425 543705 memory.go:184] no items to output this cycle
I0319 15:27:53.410440 543705 cpu.go:275] no items to output this cycle
E0319 15:28:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:03.409797 543705 memory.go:184] no items to output this cycle
I0319 15:28:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 15:28:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:13.409793 543705 memory.go:191] Add success.
W0319 15:28:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:28:13.409826 543705 cpu.go:282] Add success.
W0319 15:28:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:28:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:28:13.420159 543705 net.go:648] Add success.
I0319 15:28:13.422585 543705 net.go:770] primary dev: ETH0
I0319 15:28:13.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:28:13.422611 543705 net.go:698] Add success.
I0319 15:28:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:28:14.455089 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:28:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 15:28:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:28:14.456731 543705 disk_worker.go:494] system disk:vda1
I0319 15:28:14.456761 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:28:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:28:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:28:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:28:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:28:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:28:19.743347 543705 disk_info.go:125] begin check local disk info of client
I0319 15:28:19.745777 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:28:19.745784 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352000 0xc000352040]
E0319 15:28:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:23.409792 543705 memory.go:184] no items to output this cycle
I0319 15:28:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:28:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:33.409806 543705 memory.go:184] no items to output this cycle
I0319 15:28:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 15:28:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:43.409795 543705 memory.go:191] Add success.
I0319 15:28:43.409810 543705 cpu.go:282] Add success.
I0319 15:28:43.419885 543705 net.go:648] Add success.
I0319 15:28:43.422651 543705 net.go:770] primary dev: ETH0
I0319 15:28:43.422664 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:28:43.422677 543705 net.go:698] Add success.
I0319 15:28:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:28:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:28:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:28:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:28:53.409792 543705 memory.go:184] no items to output this cycle
I0319 15:28:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:29:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:03.409783 543705 memory.go:184] no items to output this cycle
I0319 15:29:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 15:29:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:13.409799 543705 memory.go:191] Add success.
I0319 15:29:13.409801 543705 cpu.go:282] Add success.
W0319 15:29:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:29:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:29:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:29:13.420171 543705 net.go:648] Add success.
I0319 15:29:13.423133 543705 net.go:770] primary dev: ETH0
I0319 15:29:13.423148 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:29:13.423163 543705 net.go:698] Add success.
I0319 15:29:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:29:14.455086 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:29:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0319 15:29:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:29:14.456498 543705 disk_worker.go:494] system disk:vda1
I0319 15:29:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:29:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:29:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:29:16.458073 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:29:16.458095 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:29:16.472466 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:29:19.747373 543705 disk_info.go:125] begin check local disk info of client
I0319 15:29:19.749769 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:29:19.749775 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba100 0xc0003ba140]
E0319 15:29:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:23.409788 543705 memory.go:184] no items to output this cycle
I0319 15:29:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 15:29:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:33.409779 543705 cpu.go:275] no items to output this cycle
I0319 15:29:33.409791 543705 memory.go:184] no items to output this cycle
E0319 15:29:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:43.409812 543705 memory.go:191] Add success.
I0319 15:29:43.409820 543705 cpu.go:282] Add success.
I0319 15:29:43.419998 543705 net.go:648] Add success.
I0319 15:29:43.423111 543705 net.go:770] primary dev: ETH0
I0319 15:29:43.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:29:43.423137 543705 net.go:698] Add success.
I0319 15:29:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:29:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:29:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:29:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:29:53.409781 543705 memory.go:184] no items to output this cycle
I0319 15:29:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 15:30:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:03.409794 543705 memory.go:184] no items to output this cycle
I0319 15:30:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 15:30:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:13.409801 543705 cpu.go:282] Add success.
I0319 15:30:13.409811 543705 memory.go:191] Add success.
W0319 15:30:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:30:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:30:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:30:13.420126 543705 net.go:648] Add success.
I0319 15:30:13.422625 543705 net.go:770] primary dev: ETH0
I0319 15:30:13.422639 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:30:13.422652 543705 net.go:698] Add success.
I0319 15:30:13.483204 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c372f1a8-0f07-4d4d-855a-03dc1e6c84e0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:30:13.483239 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:30:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:30:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:30:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 15:30:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:30:14.456865 543705 disk_worker.go:494] system disk:vda1
I0319 15:30:14.456894 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:30:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:30:16.457577 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:30:16.457657 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:30:16.457685 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:30:16.473044 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:30:19.751391 543705 disk_info.go:125] begin check local disk info of client
I0319 15:30:19.753816 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:30:19.753822 543705 disk_info.go:196] parse disk info done, disk is : [0xc000356080 0xc0003560c0]
E0319 15:30:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:30:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:30:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:33.409780 543705 cpu.go:275] no items to output this cycle
I0319 15:30:33.409784 543705 memory.go:184] no items to output this cycle
I0319 15:30:37.751464 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:30:37.751471 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:30:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:43.410788 543705 memory.go:191] Add success.
I0319 15:30:43.409808 543705 cpu.go:282] Add success.
I0319 15:30:43.420595 543705 net.go:648] Add success.
I0319 15:30:43.423931 543705 net.go:770] primary dev: ETH0
I0319 15:30:43.423946 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:30:43.423960 543705 net.go:698] Add success.
I0319 15:30:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:30:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:30:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:30:53.409772 543705 memory.go:184] no items to output this cycle
I0319 15:30:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:31:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:03.409788 543705 memory.go:184] no items to output this cycle
I0319 15:31:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:31:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:13.409797 543705 memory.go:191] Add success.
I0319 15:31:13.409806 543705 cpu.go:282] Add success.
W0319 15:31:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:31:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:31:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:31:13.420294 543705 net.go:648] Add success.
I0319 15:31:13.423029 543705 net.go:770] primary dev: ETH0
I0319 15:31:13.423044 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:31:13.423058 543705 net.go:698] Add success.
I0319 15:31:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:31:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:31:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 15:31:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:31:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 15:31:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:31:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:31:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:31:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:31:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:31:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:31:19.753907 543705 disk_info.go:125] begin check local disk info of client
I0319 15:31:19.756309 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:31:19.756315 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256340 0xc000256380]
E0319 15:31:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:23.409787 543705 memory.go:184] no items to output this cycle
I0319 15:31:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 15:31:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:33.409782 543705 cpu.go:275] no items to output this cycle
I0319 15:31:33.409794 543705 memory.go:184] no items to output this cycle
E0319 15:31:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:43.409804 543705 memory.go:191] Add success.
I0319 15:31:43.409814 543705 cpu.go:282] Add success.
I0319 15:31:43.419879 543705 net.go:648] Add success.
I0319 15:31:43.423148 543705 net.go:770] primary dev: ETH0
I0319 15:31:43.423160 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:31:43.423174 543705 net.go:698] Add success.
I0319 15:31:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:31:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:31:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:31:53.410351 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:31:53.410368 543705 memory.go:184] no items to output this cycle
I0319 15:31:53.410389 543705 cpu.go:275] no items to output this cycle
E0319 15:32:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:03.409784 543705 memory.go:184] no items to output this cycle
I0319 15:32:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 15:32:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:13.409798 543705 memory.go:191] Add success.
I0319 15:32:13.409799 543705 cpu.go:282] Add success.
W0319 15:32:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:32:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:32:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:32:13.420072 543705 net.go:648] Add success.
I0319 15:32:13.422830 543705 net.go:770] primary dev: ETH0
I0319 15:32:13.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:32:13.422860 543705 net.go:698] Add success.
W0319 15:32:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:32:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 15:32:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:32:14.456174 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:32:14.456184 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:32:14.456190 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:32:14.456487 543705 disk_worker.go:494] system disk:vda1
I0319 15:32:14.456519 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:32:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:32:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:32:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:32:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:32:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:32:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:32:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:32:19.757420 543705 disk_info.go:125] begin check local disk info of client
I0319 15:32:19.759831 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:32:19.759837 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd500 0xc0002bd540]
E0319 15:32:23.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:23.409861 543705 memory.go:184] no items to output this cycle
I0319 15:32:23.409944 543705 cpu.go:275] no items to output this cycle
E0319 15:32:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:33.409775 543705 memory.go:184] no items to output this cycle
I0319 15:32:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:32:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:43.409795 543705 memory.go:191] Add success.
I0319 15:32:43.409832 543705 cpu.go:282] Add success.
I0319 15:32:43.419964 543705 net.go:648] Add success.
I0319 15:32:43.422798 543705 net.go:770] primary dev: ETH0
I0319 15:32:43.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:32:43.422823 543705 net.go:698] Add success.
I0319 15:32:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:32:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:32:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:32:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:32:53.409788 543705 cpu.go:275] no items to output this cycle
I0319 15:32:53.409789 543705 memory.go:184] no items to output this cycle
E0319 15:33:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:03.409818 543705 memory.go:184] no items to output this cycle
I0319 15:33:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 15:33:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:13.409803 543705 memory.go:191] Add success.
I0319 15:33:13.409821 543705 cpu.go:282] Add success.
W0319 15:33:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:33:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:33:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:33:13.420197 543705 net.go:648] Add success.
I0319 15:33:13.423154 543705 net.go:770] primary dev: ETH0
I0319 15:33:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:33:13.423184 543705 net.go:698] Add success.
I0319 15:33:13.469257 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7790ec70-d2c8-4454-a627-28a6d1988430","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:33:13.469291 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:33:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:33:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:33:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 15:33:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:33:14.456596 543705 disk_worker.go:494] system disk:vda1
I0319 15:33:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:33:15.456022 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:33:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:33:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:33:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:33:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:33:19.760443 543705 disk_info.go:125] begin check local disk info of client
I0319 15:33:19.762887 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:33:19.762893 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc2c0 0xc0002bc300]
E0319 15:33:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:33:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 15:33:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:33.409809 543705 memory.go:184] no items to output this cycle
I0319 15:33:33.409820 543705 cpu.go:275] no items to output this cycle
I0319 15:33:37.752465 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:33:37.752471 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:33:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:43.410829 543705 memory.go:191] Add success.
I0319 15:33:43.409811 543705 cpu.go:282] Add success.
I0319 15:33:43.420611 543705 net.go:648] Add success.
I0319 15:33:43.423338 543705 net.go:770] primary dev: ETH0
I0319 15:33:43.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:33:43.423368 543705 net.go:698] Add success.
I0319 15:33:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:33:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:33:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:33:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:33:53.409791 543705 memory.go:184] no items to output this cycle
I0319 15:33:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:34:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:03.409815 543705 memory.go:184] no items to output this cycle
I0319 15:34:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 15:34:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:13.409810 543705 memory.go:191] Add success.
I0319 15:34:13.409810 543705 cpu.go:282] Add success.
W0319 15:34:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:34:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:34:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:34:13.420131 543705 net.go:648] Add success.
I0319 15:34:13.423224 543705 net.go:770] primary dev: ETH0
I0319 15:34:13.423238 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:34:13.423253 543705 net.go:698] Add success.
I0319 15:34:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:34:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:34:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 15:34:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:34:14.456590 543705 disk_worker.go:494] system disk:vda1
I0319 15:34:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:34:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:34:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:34:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:34:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:34:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:34:19.764456 543705 disk_info.go:125] begin check local disk info of client
I0319 15:34:19.766910 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:34:19.766916 543705 disk_info.go:196] parse disk info done, disk is : [0xc000494340 0xc000494380]
E0319 15:34:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:23.409764 543705 memory.go:184] no items to output this cycle
I0319 15:34:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:34:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:33.409789 543705 cpu.go:275] no items to output this cycle
I0319 15:34:33.409791 543705 memory.go:184] no items to output this cycle
E0319 15:34:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:43.409820 543705 memory.go:191] Add success.
I0319 15:34:43.409837 543705 cpu.go:282] Add success.
I0319 15:34:43.419980 543705 net.go:648] Add success.
I0319 15:34:43.422774 543705 net.go:770] primary dev: ETH0
I0319 15:34:43.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:34:43.422804 543705 net.go:698] Add success.
I0319 15:34:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:34:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:34:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:34:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:34:53.409779 543705 memory.go:184] no items to output this cycle
I0319 15:34:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 15:35:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:03.409794 543705 cpu.go:275] no items to output this cycle
I0319 15:35:03.409800 543705 memory.go:184] no items to output this cycle
E0319 15:35:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:13.409792 543705 memory.go:191] Add success.
I0319 15:35:13.409805 543705 cpu.go:282] Add success.
W0319 15:35:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:35:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:35:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:35:13.420128 543705 net.go:648] Add success.
I0319 15:35:13.422687 543705 net.go:770] primary dev: ETH0
I0319 15:35:13.422703 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:35:13.422721 543705 net.go:698] Add success.
I0319 15:35:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:35:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:35:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 15:35:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:35:14.456583 543705 disk_worker.go:494] system disk:vda1
I0319 15:35:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:35:15.456020 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:35:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:35:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:35:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:35:19.768467 543705 disk_info.go:125] begin check local disk info of client
I0319 15:35:19.770868 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:35:19.770874 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266080 0xc0002660c0]
E0319 15:35:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:23.409798 543705 memory.go:184] no items to output this cycle
I0319 15:35:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 15:35:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:33.409767 543705 memory.go:184] no items to output this cycle
I0319 15:35:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:35:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:43.409798 543705 memory.go:191] Add success.
I0319 15:35:43.409800 543705 cpu.go:282] Add success.
I0319 15:35:43.419906 543705 net.go:648] Add success.
I0319 15:35:43.422551 543705 net.go:770] primary dev: ETH0
I0319 15:35:43.422566 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:35:43.422581 543705 net.go:698] Add success.
I0319 15:35:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:35:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:35:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:35:53.410271 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:35:53.410289 543705 memory.go:184] no items to output this cycle
I0319 15:35:53.410293 543705 cpu.go:275] no items to output this cycle
E0319 15:36:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:03.409791 543705 memory.go:184] no items to output this cycle
I0319 15:36:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 15:36:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:13.409818 543705 memory.go:191] Add success.
I0319 15:36:13.409824 543705 cpu.go:282] Add success.
W0319 15:36:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:36:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:36:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:36:13.420210 543705 net.go:648] Add success.
I0319 15:36:13.423057 543705 net.go:770] primary dev: ETH0
I0319 15:36:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:36:13.423085 543705 net.go:698] Add success.
I0319 15:36:13.468442 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3b8d7efd-401f-48b1-bfe2-d4405781b8bb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:36:13.468477 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:36:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:36:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:36:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 15:36:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:36:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 15:36:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:36:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:36:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:36:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:36:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:36:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:36:19.772497 543705 disk_info.go:125] begin check local disk info of client
I0319 15:36:19.774947 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:36:19.774953 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348300 0xc000348340]
E0319 15:36:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:23.409794 543705 memory.go:184] no items to output this cycle
I0319 15:36:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:36:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:33.409779 543705 cpu.go:275] no items to output this cycle
I0319 15:36:33.409781 543705 memory.go:184] no items to output this cycle
I0319 15:36:37.752629 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:36:37.752635 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:36:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:43.410676 543705 memory.go:191] Add success.
I0319 15:36:43.409801 543705 cpu.go:282] Add success.
I0319 15:36:43.420455 543705 net.go:648] Add success.
I0319 15:36:43.423215 543705 net.go:770] primary dev: ETH0
I0319 15:36:43.423228 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:36:43.423241 543705 net.go:698] Add success.
I0319 15:36:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:36:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:36:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:36:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:36:53.409769 543705 memory.go:184] no items to output this cycle
I0319 15:36:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 15:37:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:03.409799 543705 memory.go:184] no items to output this cycle
I0319 15:37:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 15:37:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:13.409787 543705 memory.go:191] Add success.
I0319 15:37:13.409811 543705 cpu.go:282] Add success.
W0319 15:37:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:37:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:37:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:37:13.420135 543705 net.go:648] Add success.
I0319 15:37:13.423150 543705 net.go:770] primary dev: ETH0
I0319 15:37:13.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:37:13.423174 543705 net.go:698] Add success.
I0319 15:37:13.453724 543705 event_worker.go:152] Polling the log file for events...
W0319 15:37:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:37:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 15:37:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:37:14.456863 543705 disk_worker.go:494] system disk:vda1
I0319 15:37:14.456902 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:37:14.457686 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:37:14.457713 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:37:14.457718 543705 custom_config.go:64] query custom config with name: gpu
E0319 15:37:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:37:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:37:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:37:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:37:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:37:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:37:16.472333 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:37:19.776566 543705 disk_info.go:125] begin check local disk info of client
I0319 15:37:19.778986 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:37:19.778992 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035e3c0 0xc00035e400]
E0319 15:37:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:23.409776 543705 memory.go:184] no items to output this cycle
I0319 15:37:23.409777 543705 cpu.go:275] no items to output this cycle
E0319 15:37:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:33.409804 543705 memory.go:184] no items to output this cycle
I0319 15:37:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 15:37:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:43.409781 543705 memory.go:191] Add success.
I0319 15:37:43.409788 543705 cpu.go:282] Add success.
I0319 15:37:43.419871 543705 net.go:648] Add success.
I0319 15:37:43.422837 543705 net.go:770] primary dev: ETH0
I0319 15:37:43.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:37:43.422862 543705 net.go:698] Add success.
I0319 15:37:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:37:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:37:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:37:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:37:53.409765 543705 memory.go:184] no items to output this cycle
I0319 15:37:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 15:38:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:03.409779 543705 memory.go:184] no items to output this cycle
I0319 15:38:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:38:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:13.409828 543705 memory.go:191] Add success.
I0319 15:38:13.409835 543705 cpu.go:282] Add success.
W0319 15:38:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:38:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:38:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:38:13.420165 543705 net.go:648] Add success.
I0319 15:38:13.422956 543705 net.go:770] primary dev: ETH0
I0319 15:38:13.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:38:13.422982 543705 net.go:698] Add success.
I0319 15:38:14.455021 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:38:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:38:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 15:38:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:38:14.456564 543705 disk_worker.go:494] system disk:vda1
I0319 15:38:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:38:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:38:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:38:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:38:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:38:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:38:19.780539 543705 disk_info.go:125] begin check local disk info of client
I0319 15:38:19.783043 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:38:19.783050 543705 disk_info.go:196] parse disk info done, disk is : [0xc00051e900 0xc00051e940]
E0319 15:38:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:23.409760 543705 memory.go:184] no items to output this cycle
I0319 15:38:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 15:38:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:33.409808 543705 memory.go:184] no items to output this cycle
I0319 15:38:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 15:38:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:43.409800 543705 memory.go:191] Add success.
I0319 15:38:43.409825 543705 cpu.go:282] Add success.
I0319 15:38:43.419868 543705 net.go:648] Add success.
I0319 15:38:43.423138 543705 net.go:770] primary dev: ETH0
I0319 15:38:43.423152 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:38:43.423164 543705 net.go:698] Add success.
I0319 15:38:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:38:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:38:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:38:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 15:38:53.409809 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:38:53.409830 543705 memory.go:184] no items to output this cycle
E0319 15:39:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:03.409787 543705 memory.go:184] no items to output this cycle
I0319 15:39:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:39:13.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:13.409836 543705 memory.go:191] Add success.
I0319 15:39:13.409842 543705 cpu.go:282] Add success.
W0319 15:39:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:39:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:39:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:39:13.420182 543705 net.go:648] Add success.
I0319 15:39:13.422825 543705 net.go:770] primary dev: ETH0
I0319 15:39:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:39:13.422850 543705 net.go:698] Add success.
I0319 15:39:13.469494 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7cf8f641-2e05-44de-abaf-4a8e3f09be46","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:39:13.469528 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:39:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:39:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:39:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 15:39:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:39:14.456608 543705 disk_worker.go:494] system disk:vda1
I0319 15:39:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:39:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:39:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:39:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:39:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:39:16.472435 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:39:19.784555 543705 disk_info.go:125] begin check local disk info of client
I0319 15:39:19.786982 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:39:19.786988 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc440 0xc0002bc480]
E0319 15:39:23.410230 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:23.410246 543705 memory.go:184] no items to output this cycle
I0319 15:39:23.410274 543705 cpu.go:275] no items to output this cycle
E0319 15:39:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:33.409816 543705 memory.go:184] no items to output this cycle
I0319 15:39:33.409828 543705 cpu.go:275] no items to output this cycle
I0319 15:39:37.752789 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:39:37.752795 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:39:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:43.410632 543705 memory.go:191] Add success.
I0319 15:39:43.409831 543705 cpu.go:282] Add success.
I0319 15:39:43.420317 543705 net.go:648] Add success.
I0319 15:39:43.423088 543705 net.go:770] primary dev: ETH0
I0319 15:39:43.423102 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:39:43.423113 543705 net.go:698] Add success.
I0319 15:39:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:39:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:39:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:39:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:39:53.409776 543705 memory.go:184] no items to output this cycle
I0319 15:39:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:40:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:03.409785 543705 memory.go:184] no items to output this cycle
I0319 15:40:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 15:40:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:13.409795 543705 memory.go:191] Add success.
I0319 15:40:13.409798 543705 cpu.go:282] Add success.
W0319 15:40:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:40:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:40:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:40:13.420236 543705 net.go:648] Add success.
I0319 15:40:13.422923 543705 net.go:770] primary dev: ETH0
I0319 15:40:13.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:40:13.422952 543705 net.go:698] Add success.
I0319 15:40:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:40:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:40:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0319 15:40:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:40:14.456522 543705 disk_worker.go:494] system disk:vda1
I0319 15:40:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:40:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:40:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:40:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:40:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:40:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:40:19.788572 543705 disk_info.go:125] begin check local disk info of client
I0319 15:40:19.791095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:40:19.791101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc140 0xc0002bc180]
E0319 15:40:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:23.409759 543705 memory.go:184] no items to output this cycle
I0319 15:40:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 15:40:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:33.409798 543705 memory.go:184] no items to output this cycle
I0319 15:40:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 15:40:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:43.409829 543705 memory.go:191] Add success.
I0319 15:40:43.409830 543705 cpu.go:282] Add success.
I0319 15:40:43.419990 543705 net.go:648] Add success.
I0319 15:40:43.423069 543705 net.go:770] primary dev: ETH0
I0319 15:40:43.423085 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:40:43.423099 543705 net.go:698] Add success.
I0319 15:40:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:40:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:40:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:40:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:40:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 15:40:53.409787 543705 memory.go:184] no items to output this cycle
E0319 15:41:03.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:03.409824 543705 memory.go:184] no items to output this cycle
I0319 15:41:03.409838 543705 cpu.go:275] no items to output this cycle
E0319 15:41:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:13.409798 543705 memory.go:191] Add success.
I0319 15:41:13.409811 543705 cpu.go:282] Add success.
W0319 15:41:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:41:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:41:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:41:13.420343 543705 net.go:648] Add success.
I0319 15:41:13.423052 543705 net.go:770] primary dev: ETH0
I0319 15:41:13.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:41:13.423076 543705 net.go:698] Add success.
I0319 15:41:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:41:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:41:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 15:41:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:41:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 15:41:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:41:15.456012 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:41:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:41:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:41:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:41:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:41:19.792592 543705 disk_info.go:125] begin check local disk info of client
I0319 15:41:19.795046 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:41:19.795051 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a2080 0xc0002a20c0]
E0319 15:41:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:23.409798 543705 memory.go:184] no items to output this cycle
I0319 15:41:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:41:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:33.409773 543705 memory.go:184] no items to output this cycle
I0319 15:41:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:41:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:43.409812 543705 memory.go:191] Add success.
I0319 15:41:43.409819 543705 cpu.go:282] Add success.
I0319 15:41:43.419976 543705 net.go:648] Add success.
I0319 15:41:43.423271 543705 net.go:770] primary dev: ETH0
I0319 15:41:43.423287 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:41:43.423301 543705 net.go:698] Add success.
I0319 15:41:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:41:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:41:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:41:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:41:53.409811 543705 memory.go:184] no items to output this cycle
I0319 15:41:53.409819 543705 cpu.go:275] no items to output this cycle
E0319 15:42:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:03.409800 543705 memory.go:184] no items to output this cycle
I0319 15:42:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 15:42:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:13.409824 543705 memory.go:191] Add success.
I0319 15:42:13.409831 543705 cpu.go:282] Add success.
W0319 15:42:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:42:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:42:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:42:13.420180 543705 net.go:648] Add success.
I0319 15:42:13.422820 543705 net.go:770] primary dev: ETH0
I0319 15:42:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:42:13.422845 543705 net.go:698] Add success.
I0319 15:42:13.469153 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"83f3560a-bf62-4ab9-a60f-4e0748766053","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:42:13.469195 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 15:42:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:42:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 15:42:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:42:14.457032 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:42:14.457041 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:42:14.457047 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:42:14.457116 543705 disk_worker.go:494] system disk:vda1
I0319 15:42:14.457169 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:42:15.456477 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:42:15.456487 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:42:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:42:16.457998 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:42:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:42:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:42:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:42:19.796619 543705 disk_info.go:125] begin check local disk info of client
I0319 15:42:19.799084 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:42:19.799091 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b8000 0xc0002b8040]
E0319 15:42:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:23.409763 543705 memory.go:184] no items to output this cycle
I0319 15:42:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:42:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:33.409767 543705 memory.go:184] no items to output this cycle
I0319 15:42:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 15:42:37.753744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:42:37.753751 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:42:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:43.410704 543705 memory.go:191] Add success.
I0319 15:42:43.409815 543705 cpu.go:282] Add success.
I0319 15:42:43.420428 543705 net.go:648] Add success.
I0319 15:42:43.423512 543705 net.go:770] primary dev: ETH0
I0319 15:42:43.423526 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:42:43.423541 543705 net.go:698] Add success.
I0319 15:42:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:42:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:42:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:42:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:42:53.409807 543705 memory.go:184] no items to output this cycle
I0319 15:42:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 15:43:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:03.409798 543705 memory.go:184] no items to output this cycle
I0319 15:43:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:43:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:13.409820 543705 memory.go:191] Add success.
I0319 15:43:13.409827 543705 cpu.go:282] Add success.
W0319 15:43:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:43:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:43:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:43:13.420149 543705 net.go:648] Add success.
I0319 15:43:13.422805 543705 net.go:770] primary dev: ETH0
I0319 15:43:13.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:43:13.422836 543705 net.go:698] Add success.
I0319 15:43:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:43:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:43:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 15:43:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:43:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 15:43:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:43:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:43:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:43:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:43:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:43:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:43:19.800464 543705 disk_info.go:125] begin check local disk info of client
I0319 15:43:19.802909 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:43:19.802915 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e1c0 0xc00039e200]
E0319 15:43:23.410004 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:23.410032 543705 memory.go:184] no items to output this cycle
I0319 15:43:23.410191 543705 cpu.go:275] no items to output this cycle
E0319 15:43:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:33.409773 543705 memory.go:184] no items to output this cycle
I0319 15:43:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:43:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:43.409792 543705 memory.go:191] Add success.
I0319 15:43:43.409793 543705 cpu.go:282] Add success.
I0319 15:43:43.419954 543705 net.go:648] Add success.
I0319 15:43:43.422680 543705 net.go:770] primary dev: ETH0
I0319 15:43:43.422695 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:43:43.422709 543705 net.go:698] Add success.
I0319 15:43:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:43:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:43:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:43:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:43:53.409809 543705 memory.go:184] no items to output this cycle
I0319 15:43:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 15:44:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:03.409780 543705 memory.go:184] no items to output this cycle
I0319 15:44:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 15:44:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:13.409797 543705 memory.go:191] Add success.
I0319 15:44:13.409805 543705 cpu.go:282] Add success.
W0319 15:44:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:44:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:44:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:44:13.420052 543705 net.go:648] Add success.
I0319 15:44:13.422757 543705 net.go:770] primary dev: ETH0
I0319 15:44:13.422770 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:44:13.422782 543705 net.go:698] Add success.
I0319 15:44:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:44:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:44:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 15:44:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:44:14.456519 543705 disk_worker.go:494] system disk:vda1
I0319 15:44:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:44:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:44:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:44:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:44:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:44:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:44:19.804650 543705 disk_info.go:125] begin check local disk info of client
I0319 15:44:19.807051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:44:19.807056 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005083c0 0xc000508400]
E0319 15:44:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:23.409764 543705 memory.go:184] no items to output this cycle
I0319 15:44:23.409891 543705 cpu.go:275] no items to output this cycle
E0319 15:44:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:33.409778 543705 memory.go:184] no items to output this cycle
I0319 15:44:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 15:44:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:43.409803 543705 memory.go:191] Add success.
I0319 15:44:43.409819 543705 cpu.go:282] Add success.
I0319 15:44:43.419967 543705 net.go:648] Add success.
I0319 15:44:43.422624 543705 net.go:770] primary dev: ETH0
I0319 15:44:43.422637 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:44:43.422649 543705 net.go:698] Add success.
I0319 15:44:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:44:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:44:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:44:53.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:44:53.409828 543705 memory.go:184] no items to output this cycle
I0319 15:44:53.409842 543705 cpu.go:275] no items to output this cycle
E0319 15:45:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:03.409773 543705 memory.go:184] no items to output this cycle
I0319 15:45:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:45:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:13.409834 543705 memory.go:191] Add success.
I0319 15:45:13.409847 543705 cpu.go:282] Add success.
W0319 15:45:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:45:13.409883 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:45:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:45:13.420121 543705 net.go:648] Add success.
I0319 15:45:13.422789 543705 net.go:770] primary dev: ETH0
I0319 15:45:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:45:13.422815 543705 net.go:698] Add success.
I0319 15:45:13.469262 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"32d56532-dad3-4796-8bb1-57dc859365f0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:45:13.469302 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:45:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:45:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:45:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 15:45:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:45:14.456561 543705 disk_worker.go:494] system disk:vda1
I0319 15:45:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:45:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:45:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:45:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:45:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:45:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:45:19.808671 543705 disk_info.go:125] begin check local disk info of client
I0319 15:45:19.811110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:45:19.811116 543705 disk_info.go:196] parse disk info done, disk is : [0xc000248a40 0xc000248a80]
E0319 15:45:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:23.409773 543705 memory.go:184] no items to output this cycle
I0319 15:45:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:45:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:33.409782 543705 memory.go:184] no items to output this cycle
I0319 15:45:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 15:45:37.755473 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:45:37.755480 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:45:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:43.410760 543705 memory.go:191] Add success.
I0319 15:45:43.409812 543705 cpu.go:282] Add success.
I0319 15:45:43.420465 543705 net.go:648] Add success.
I0319 15:45:43.423121 543705 net.go:770] primary dev: ETH0
I0319 15:45:43.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:45:43.423146 543705 net.go:698] Add success.
I0319 15:45:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:45:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:45:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:45:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:45:53.409802 543705 memory.go:184] no items to output this cycle
I0319 15:45:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 15:46:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:03.409773 543705 memory.go:184] no items to output this cycle
I0319 15:46:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 15:46:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:13.409783 543705 memory.go:191] Add success.
W0319 15:46:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:46:13.409815 543705 cpu.go:282] Add success.
W0319 15:46:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:46:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:46:13.420131 543705 net.go:648] Add success.
I0319 15:46:13.423118 543705 net.go:770] primary dev: ETH0
I0319 15:46:13.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:46:13.423143 543705 net.go:698] Add success.
I0319 15:46:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:46:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:46:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 15:46:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:46:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 15:46:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:46:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:46:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:46:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:46:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:46:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:46:19.812699 543705 disk_info.go:125] begin check local disk info of client
I0319 15:46:19.815163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:46:19.815170 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bf800 0xc0003bf840]
E0319 15:46:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:23.409762 543705 memory.go:184] no items to output this cycle
I0319 15:46:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:46:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:33.409774 543705 memory.go:184] no items to output this cycle
I0319 15:46:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 15:46:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:43.409805 543705 memory.go:191] Add success.
I0319 15:46:43.409821 543705 cpu.go:282] Add success.
I0319 15:46:43.419991 543705 net.go:648] Add success.
I0319 15:46:43.423129 543705 net.go:770] primary dev: ETH0
I0319 15:46:43.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:46:43.423154 543705 net.go:698] Add success.
I0319 15:46:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:46:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:46:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:46:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:46:53.409773 543705 memory.go:184] no items to output this cycle
I0319 15:46:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 15:47:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:03.409767 543705 memory.go:184] no items to output this cycle
I0319 15:47:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:47:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:13.409792 543705 memory.go:191] Add success.
I0319 15:47:13.409815 543705 cpu.go:282] Add success.
W0319 15:47:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:47:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:47:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:47:13.420200 543705 net.go:648] Add success.
I0319 15:47:13.422972 543705 net.go:770] primary dev: ETH0
I0319 15:47:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:47:13.423002 543705 net.go:698] Add success.
I0319 15:47:13.453563 543705 event_worker.go:152] Polling the log file for events...
W0319 15:47:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:47:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 15:47:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:47:14.456924 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:47:14.456934 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:47:14.456940 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:47:14.457018 543705 disk_worker.go:494] system disk:vda1
I0319 15:47:14.457060 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:47:15.456852 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:47:15.456861 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:47:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:47:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:47:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:47:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:47:16.472351 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:47:19.816857 543705 disk_info.go:125] begin check local disk info of client
I0319 15:47:19.819275 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:47:19.819282 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003529c0 0xc000352a00]
E0319 15:47:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:23.409792 543705 memory.go:184] no items to output this cycle
I0319 15:47:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 15:47:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:33.409796 543705 memory.go:184] no items to output this cycle
I0319 15:47:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 15:47:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:43.409799 543705 memory.go:191] Add success.
I0319 15:47:43.409817 543705 cpu.go:282] Add success.
I0319 15:47:43.419987 543705 net.go:648] Add success.
I0319 15:47:43.422900 543705 net.go:770] primary dev: ETH0
I0319 15:47:43.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:47:43.422930 543705 net.go:698] Add success.
I0319 15:47:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:47:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:47:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:47:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:47:53.409770 543705 memory.go:184] no items to output this cycle
I0319 15:47:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 15:48:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:03.409768 543705 memory.go:184] no items to output this cycle
I0319 15:48:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:48:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:13.409797 543705 memory.go:191] Add success.
I0319 15:48:13.409799 543705 cpu.go:282] Add success.
W0319 15:48:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:48:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:48:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:48:13.420060 543705 net.go:648] Add success.
I0319 15:48:13.422965 543705 net.go:770] primary dev: ETH0
I0319 15:48:13.422978 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:48:13.422999 543705 net.go:698] Add success.
I0319 15:48:13.549438 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"df6cc837-9db0-4117-89c5-38dce1823315","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:48:13.549476 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:48:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:48:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:48:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 15:48:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:48:14.456546 543705 disk_worker.go:494] system disk:vda1
I0319 15:48:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:48:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:48:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:48:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:48:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:48:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:48:19.820734 543705 disk_info.go:125] begin check local disk info of client
I0319 15:48:19.823177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:48:19.823183 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027b8c0 0xc00027b900]
E0319 15:48:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:23.409810 543705 memory.go:184] no items to output this cycle
I0319 15:48:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:48:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:33.409781 543705 memory.go:184] no items to output this cycle
I0319 15:48:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 15:48:37.756482 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:48:37.756488 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:48:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:43.410729 543705 memory.go:191] Add success.
I0319 15:48:43.409836 543705 cpu.go:282] Add success.
I0319 15:48:43.420468 543705 net.go:648] Add success.
I0319 15:48:43.423150 543705 net.go:770] primary dev: ETH0
I0319 15:48:43.423164 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:48:43.423176 543705 net.go:698] Add success.
I0319 15:48:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:48:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:48:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:48:53.409769 543705 memory.go:184] no items to output this cycle
I0319 15:48:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 15:49:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:03.409788 543705 memory.go:184] no items to output this cycle
I0319 15:49:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 15:49:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:13.409790 543705 memory.go:191] Add success.
I0319 15:49:13.409810 543705 cpu.go:282] Add success.
W0319 15:49:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:49:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:49:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:49:13.420233 543705 net.go:648] Add success.
I0319 15:49:13.423025 543705 net.go:770] primary dev: ETH0
I0319 15:49:13.423043 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:49:13.423057 543705 net.go:698] Add success.
I0319 15:49:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:49:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:49:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 15:49:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:49:14.456494 543705 disk_worker.go:494] system disk:vda1
I0319 15:49:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:49:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:49:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:49:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:49:16.472496 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:49:19.824757 543705 disk_info.go:125] begin check local disk info of client
I0319 15:49:19.827216 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:49:19.827223 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cd680 0xc0004cd6c0]
E0319 15:49:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:23.409769 543705 memory.go:184] no items to output this cycle
I0319 15:49:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:49:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:33.409862 543705 cpu.go:275] no items to output this cycle
I0319 15:49:33.409889 543705 memory.go:184] no items to output this cycle
E0319 15:49:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:43.409823 543705 memory.go:191] Add success.
I0319 15:49:43.409828 543705 cpu.go:282] Add success.
I0319 15:49:43.420006 543705 net.go:648] Add success.
I0319 15:49:43.423166 543705 net.go:770] primary dev: ETH0
I0319 15:49:43.423180 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:49:43.423194 543705 net.go:698] Add success.
I0319 15:49:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:49:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:49:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:49:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:49:53.409801 543705 memory.go:184] no items to output this cycle
I0319 15:49:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 15:50:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:03.409800 543705 memory.go:184] no items to output this cycle
I0319 15:50:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:50:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:13.409805 543705 memory.go:191] Add success.
I0319 15:50:13.409815 543705 cpu.go:282] Add success.
W0319 15:50:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:50:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:50:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:50:13.420177 543705 net.go:648] Add success.
I0319 15:50:13.422952 543705 net.go:770] primary dev: ETH0
I0319 15:50:13.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:50:13.422980 543705 net.go:698] Add success.
I0319 15:50:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:50:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:50:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 15:50:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:50:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 15:50:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:50:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:50:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:50:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:50:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:50:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:50:19.828778 543705 disk_info.go:125] begin check local disk info of client
I0319 15:50:19.831243 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:50:19.831249 543705 disk_info.go:196] parse disk info done, disk is : [0xc000204cc0 0xc000204d00]
E0319 15:50:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:23.409774 543705 memory.go:184] no items to output this cycle
I0319 15:50:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 15:50:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:33.409779 543705 memory.go:184] no items to output this cycle
I0319 15:50:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 15:50:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:43.409798 543705 memory.go:191] Add success.
I0319 15:50:43.409832 543705 cpu.go:282] Add success.
I0319 15:50:43.420065 543705 net.go:648] Add success.
I0319 15:50:43.422828 543705 net.go:770] primary dev: ETH0
I0319 15:50:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:50:43.422854 543705 net.go:698] Add success.
I0319 15:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:50:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:50:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:50:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:50:53.409780 543705 memory.go:184] no items to output this cycle
I0319 15:50:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 15:51:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:03.409777 543705 memory.go:184] no items to output this cycle
I0319 15:51:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 15:51:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:13.409799 543705 memory.go:191] Add success.
I0319 15:51:13.409800 543705 cpu.go:282] Add success.
W0319 15:51:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:51:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:51:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:51:13.420188 543705 net.go:648] Add success.
I0319 15:51:13.423220 543705 net.go:770] primary dev: ETH0
I0319 15:51:13.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:51:13.423245 543705 net.go:698] Add success.
I0319 15:51:13.468206 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bb5e6b89-4f42-4225-9d02-6691d8b5f565","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:51:13.468242 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:51:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:51:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:51:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 15:51:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:51:14.456829 543705 disk_worker.go:494] system disk:vda1
I0319 15:51:14.456860 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:51:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:51:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:51:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:51:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:51:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:51:19.832787 543705 disk_info.go:125] begin check local disk info of client
I0319 15:51:19.835263 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:51:19.835270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e2240 0xc0003e2280]
E0319 15:51:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:23.409764 543705 memory.go:184] no items to output this cycle
I0319 15:51:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 15:51:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:33.409770 543705 memory.go:184] no items to output this cycle
I0319 15:51:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 15:51:37.757486 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:51:37.757493 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:51:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:43.410762 543705 memory.go:191] Add success.
I0319 15:51:43.409832 543705 cpu.go:282] Add success.
I0319 15:51:43.420488 543705 net.go:648] Add success.
I0319 15:51:43.423464 543705 net.go:770] primary dev: ETH0
I0319 15:51:43.423476 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:51:43.423489 543705 net.go:698] Add success.
I0319 15:51:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:51:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:51:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:51:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:51:53.409772 543705 memory.go:184] no items to output this cycle
I0319 15:51:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 15:52:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:03.409794 543705 memory.go:184] no items to output this cycle
I0319 15:52:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 15:52:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:13.409801 543705 memory.go:191] Add success.
I0319 15:52:13.409811 543705 cpu.go:282] Add success.
W0319 15:52:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:52:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:52:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:52:13.420060 543705 net.go:648] Add success.
I0319 15:52:13.422819 543705 net.go:770] primary dev: ETH0
I0319 15:52:13.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:52:13.422845 543705 net.go:698] Add success.
W0319 15:52:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:52:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 15:52:14.455207 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:52:14.455922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:52:14.455931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:52:14.455938 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:52:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 15:52:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:52:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:52:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:52:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:52:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:52:16.458010 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:52:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:52:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:52:19.836814 543705 disk_info.go:125] begin check local disk info of client
I0319 15:52:19.839215 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:52:19.839221 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329a00 0xc000329a40]
E0319 15:52:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:23.409790 543705 memory.go:184] no items to output this cycle
I0319 15:52:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 15:52:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:33.409900 543705 cpu.go:275] no items to output this cycle
I0319 15:52:33.409906 543705 memory.go:184] no items to output this cycle
E0319 15:52:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:43.409828 543705 memory.go:191] Add success.
I0319 15:52:43.409832 543705 cpu.go:282] Add success.
I0319 15:52:43.419877 543705 net.go:770] primary dev: ETH0
I0319 15:52:43.419891 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:52:43.419907 543705 net.go:698] Add success.
I0319 15:52:43.420266 543705 net.go:648] Add success.
I0319 15:52:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:52:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:52:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:52:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:52:53.409784 543705 memory.go:184] no items to output this cycle
I0319 15:52:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 15:53:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:03.409799 543705 memory.go:184] no items to output this cycle
I0319 15:53:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 15:53:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:13.409791 543705 memory.go:191] Add success.
I0319 15:53:13.409809 543705 cpu.go:282] Add success.
W0319 15:53:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:53:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:53:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:53:13.420230 543705 net.go:648] Add success.
I0319 15:53:13.423348 543705 net.go:770] primary dev: ETH0
I0319 15:53:13.423361 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:53:13.423373 543705 net.go:698] Add success.
I0319 15:53:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:53:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:53:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 15:53:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:53:14.456506 543705 disk_worker.go:494] system disk:vda1
I0319 15:53:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:53:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:53:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:53:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:53:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:53:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:53:19.840842 543705 disk_info.go:125] begin check local disk info of client
I0319 15:53:19.843266 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:53:19.843272 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e5c00 0xc0000e5c40]
E0319 15:53:23.410387 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:23.410401 543705 memory.go:184] no items to output this cycle
I0319 15:53:23.410430 543705 cpu.go:275] no items to output this cycle
E0319 15:53:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:33.409777 543705 memory.go:184] no items to output this cycle
I0319 15:53:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 15:53:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:43.409802 543705 memory.go:191] Add success.
I0319 15:53:43.409821 543705 cpu.go:282] Add success.
I0319 15:53:43.420068 543705 net.go:648] Add success.
I0319 15:53:43.423156 543705 net.go:770] primary dev: ETH0
I0319 15:53:43.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:53:43.423181 543705 net.go:698] Add success.
I0319 15:53:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:53:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:53:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:53:53.410200 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:53:53.410216 543705 memory.go:184] no items to output this cycle
I0319 15:53:53.410235 543705 cpu.go:275] no items to output this cycle
E0319 15:54:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:03.409789 543705 memory.go:184] no items to output this cycle
I0319 15:54:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 15:54:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:13.409794 543705 memory.go:191] Add success.
W0319 15:54:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:54:13.409827 543705 cpu.go:282] Add success.
W0319 15:54:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:54:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:54:13.420200 543705 net.go:648] Add success.
I0319 15:54:13.422696 543705 net.go:770] primary dev: ETH0
I0319 15:54:13.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:54:13.422728 543705 net.go:698] Add success.
I0319 15:54:13.463162 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"679f03f7-203c-41cc-a624-acd7e2ac3046","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:54:13.463196 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 15:54:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:54:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:54:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 15:54:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:54:14.456515 543705 disk_worker.go:494] system disk:vda1
I0319 15:54:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:54:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:54:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:54:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:54:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:54:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:54:19.843355 543705 disk_info.go:125] begin check local disk info of client
I0319 15:54:19.845817 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:54:19.845823 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329e80 0xc000329ec0]
E0319 15:54:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:23.409776 543705 memory.go:184] no items to output this cycle
I0319 15:54:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:54:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:33.409821 543705 memory.go:184] no items to output this cycle
I0319 15:54:33.409832 543705 cpu.go:275] no items to output this cycle
I0319 15:54:37.757733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:54:37.757750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
I0319 15:54:43.409921 543705 cpu.go:282] Add success.
E0319 15:54:43.409958 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:43.410942 543705 memory.go:191] Add success.
I0319 15:54:43.419744 543705 net.go:648] Add success.
I0319 15:54:43.422377 543705 net.go:770] primary dev: ETH0
I0319 15:54:43.422396 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:54:43.422411 543705 net.go:698] Add success.
I0319 15:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:54:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:54:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:54:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:54:53.409808 543705 memory.go:184] no items to output this cycle
I0319 15:54:53.409820 543705 cpu.go:275] no items to output this cycle
E0319 15:55:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:03.409804 543705 memory.go:184] no items to output this cycle
I0319 15:55:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 15:55:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:13.409817 543705 cpu.go:282] Add success.
I0319 15:55:13.409819 543705 memory.go:191] Add success.
W0319 15:55:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:55:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:55:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:55:13.420157 543705 net.go:648] Add success.
I0319 15:55:13.422890 543705 net.go:770] primary dev: ETH0
I0319 15:55:13.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:55:13.422915 543705 net.go:698] Add success.
I0319 15:55:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:55:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:55:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 15:55:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:55:14.456612 543705 disk_worker.go:494] system disk:vda1
I0319 15:55:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:55:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:55:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:55:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:55:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:55:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:55:19.847862 543705 disk_info.go:125] begin check local disk info of client
I0319 15:55:19.850261 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:55:19.850268 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd7c0 0xc0002bd800]
E0319 15:55:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:23.409791 543705 memory.go:184] no items to output this cycle
I0319 15:55:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 15:55:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:33.409799 543705 memory.go:184] no items to output this cycle
I0319 15:55:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 15:55:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:43.409787 543705 memory.go:191] Add success.
I0319 15:55:43.409817 543705 cpu.go:282] Add success.
I0319 15:55:43.419977 543705 net.go:648] Add success.
I0319 15:55:43.422881 543705 net.go:770] primary dev: ETH0
I0319 15:55:43.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:55:43.422906 543705 net.go:698] Add success.
I0319 15:55:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:55:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:55:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:55:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:55:53.409773 543705 memory.go:184] no items to output this cycle
I0319 15:55:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:56:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:03.409767 543705 memory.go:184] no items to output this cycle
I0319 15:56:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 15:56:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:13.409822 543705 memory.go:191] Add success.
I0319 15:56:13.409842 543705 cpu.go:282] Add success.
W0319 15:56:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:56:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:56:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:56:13.420203 543705 net.go:648] Add success.
I0319 15:56:13.422922 543705 net.go:770] primary dev: ETH0
I0319 15:56:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:56:13.422947 543705 net.go:698] Add success.
I0319 15:56:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:56:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:56:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 15:56:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:56:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 15:56:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:56:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:56:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:56:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:56:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:56:19.851883 543705 disk_info.go:125] begin check local disk info of client
I0319 15:56:19.854334 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:56:19.854341 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329400 0xc000329440]
E0319 15:56:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:23.409763 543705 memory.go:184] no items to output this cycle
I0319 15:56:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 15:56:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:33.409771 543705 memory.go:184] no items to output this cycle
I0319 15:56:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:56:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:43.409797 543705 memory.go:191] Add success.
I0319 15:56:43.409800 543705 cpu.go:282] Add success.
I0319 15:56:43.419880 543705 net.go:648] Add success.
I0319 15:56:43.422784 543705 net.go:770] primary dev: ETH0
I0319 15:56:43.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:56:43.422819 543705 net.go:698] Add success.
I0319 15:56:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:56:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:56:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:56:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:56:53.409798 543705 memory.go:184] no items to output this cycle
I0319 15:56:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 15:57:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:03.409809 543705 memory.go:184] no items to output this cycle
I0319 15:57:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 15:57:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:13.409790 543705 memory.go:191] Add success.
W0319 15:57:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 15:57:13.409824 543705 cpu.go:282] Add success.
W0319 15:57:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:57:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:57:13.420206 543705 net.go:648] Add success.
I0319 15:57:13.423084 543705 net.go:770] primary dev: ETH0
I0319 15:57:13.423099 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:57:13.423113 543705 net.go:698] Add success.
I0319 15:57:13.429427 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 15:57:13.453618 543705 event_worker.go:152] Polling the log file for events...
I0319 15:57:13.464327 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"272ddf84-f77b-4a8b-8b46-352305f92fb5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 15:57:13.464363 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 15:57:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:57:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 15:57:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0319 15:57:14.456135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 15:57:14.456145 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 15:57:14.456151 543705 custom_config.go:64] query custom config with name: gpu
I0319 15:57:14.456512 543705 disk_worker.go:494] system disk:vda1
I0319 15:57:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 15:57:15.456795 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 15:57:15.456804 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:57:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 15:57:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 15:57:16.458011 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:57:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:57:16.472454 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:57:19.855910 543705 disk_info.go:125] begin check local disk info of client
I0319 15:57:19.858298 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:57:19.858303 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6540 0xc0003b6580]
E0319 15:57:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:23.409761 543705 memory.go:184] no items to output this cycle
I0319 15:57:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 15:57:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:33.409775 543705 memory.go:184] no items to output this cycle
I0319 15:57:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 15:57:37.759496 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 15:57:37.759503 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 15:57:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:43.410735 543705 memory.go:191] Add success.
I0319 15:57:43.409807 543705 cpu.go:282] Add success.
I0319 15:57:43.420450 543705 net.go:648] Add success.
I0319 15:57:43.423195 543705 net.go:770] primary dev: ETH0
I0319 15:57:43.423210 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:57:43.423223 543705 net.go:698] Add success.
I0319 15:57:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:57:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:57:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:57:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:57:53.409801 543705 memory.go:184] no items to output this cycle
I0319 15:57:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 15:58:03.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:03.409892 543705 memory.go:184] no items to output this cycle
I0319 15:58:03.409921 543705 cpu.go:275] no items to output this cycle
E0319 15:58:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:13.409801 543705 memory.go:191] Add success.
I0319 15:58:13.409812 543705 cpu.go:282] Add success.
W0319 15:58:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:58:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:58:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:58:13.420300 543705 net.go:648] Add success.
I0319 15:58:13.422802 543705 net.go:770] primary dev: ETH0
I0319 15:58:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:58:13.422829 543705 net.go:698] Add success.
I0319 15:58:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:58:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:58:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 15:58:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:58:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 15:58:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:58:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:58:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:58:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:58:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:58:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:58:19.859927 543705 disk_info.go:125] begin check local disk info of client
I0319 15:58:19.862336 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:58:19.862342 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa6c0 0xc0001aa700]
E0319 15:58:23.410228 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:23.410242 543705 memory.go:184] no items to output this cycle
I0319 15:58:23.410275 543705 cpu.go:275] no items to output this cycle
E0319 15:58:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:33.409779 543705 memory.go:184] no items to output this cycle
I0319 15:58:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 15:58:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:43.409789 543705 memory.go:191] Add success.
I0319 15:58:43.409806 543705 cpu.go:282] Add success.
I0319 15:58:43.419972 543705 net.go:648] Add success.
I0319 15:58:43.422979 543705 net.go:770] primary dev: ETH0
I0319 15:58:43.422993 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:58:43.423006 543705 net.go:698] Add success.
I0319 15:58:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:58:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:58:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:58:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:58:53.409773 543705 memory.go:184] no items to output this cycle
I0319 15:58:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 15:59:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:03.409797 543705 memory.go:184] no items to output this cycle
I0319 15:59:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 15:59:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:13.409805 543705 memory.go:191] Add success.
I0319 15:59:13.409823 543705 cpu.go:282] Add success.
W0319 15:59:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 15:59:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 15:59:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 15:59:13.420223 543705 net.go:648] Add success.
I0319 15:59:13.423388 543705 net.go:770] primary dev: ETH0
I0319 15:59:13.423402 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:59:13.423416 543705 net.go:698] Add success.
I0319 15:59:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 15:59:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 15:59:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0319 15:59:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0319 15:59:14.456606 543705 disk_worker.go:494] system disk:vda1
I0319 15:59:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 15:59:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 15:59:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:59:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:59:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 15:59:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 15:59:19.863956 543705 disk_info.go:125] begin check local disk info of client
I0319 15:59:19.866425 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 15:59:19.866431 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d9980 0xc0004d99c0]
E0319 15:59:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:23.409795 543705 memory.go:184] no items to output this cycle
I0319 15:59:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 15:59:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:33.409766 543705 memory.go:184] no items to output this cycle
I0319 15:59:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 15:59:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:43.409798 543705 memory.go:191] Add success.
I0319 15:59:43.409806 543705 cpu.go:282] Add success.
I0319 15:59:43.419967 543705 net.go:648] Add success.
I0319 15:59:43.422613 543705 net.go:770] primary dev: ETH0
I0319 15:59:43.422626 543705 net.go:802] Send network stats successfully!,count is 6
I0319 15:59:43.422640 543705 net.go:698] Add success.
I0319 15:59:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 15:59:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 15:59:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 15:59:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 15:59:53.409798 543705 memory.go:184] no items to output this cycle
I0319 15:59:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:00:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:03.409767 543705 memory.go:184] no items to output this cycle
I0319 16:00:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 16:00:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:13.409807 543705 memory.go:191] Add success.
I0319 16:00:13.409809 543705 cpu.go:282] Add success.
W0319 16:00:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:00:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:00:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:00:13.420605 543705 net.go:648] Add success.
I0319 16:00:13.423404 543705 net.go:770] primary dev: ETH0
I0319 16:00:13.423416 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:00:13.423429 543705 net.go:698] Add success.
I0319 16:00:13.469071 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"40a0bc7a-aae7-456f-94d6-da19590d72ff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:00:13.469103 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:00:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:00:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:00:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 16:00:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:00:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 16:00:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:00:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:00:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:00:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:00:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:00:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:00:19.867970 543705 disk_info.go:125] begin check local disk info of client
I0319 16:00:19.870405 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:00:19.870411 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa640 0xc0001aa680]
E0319 16:00:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:23.409796 543705 memory.go:184] no items to output this cycle
I0319 16:00:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:00:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:33.409777 543705 memory.go:184] no items to output this cycle
I0319 16:00:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 16:00:37.759640 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:00:37.759647 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:00:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:43.410654 543705 memory.go:191] Add success.
I0319 16:00:43.409830 543705 cpu.go:282] Add success.
I0319 16:00:43.420355 543705 net.go:648] Add success.
I0319 16:00:43.423159 543705 net.go:770] primary dev: ETH0
I0319 16:00:43.423174 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:00:43.423187 543705 net.go:698] Add success.
I0319 16:00:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:00:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:00:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:00:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:00:53.409776 543705 memory.go:184] no items to output this cycle
I0319 16:00:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:01:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:03.409766 543705 memory.go:184] no items to output this cycle
I0319 16:01:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 16:01:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:13.409794 543705 memory.go:191] Add success.
W0319 16:01:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:01:13.409826 543705 cpu.go:282] Add success.
W0319 16:01:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:01:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:01:13.420137 543705 net.go:648] Add success.
I0319 16:01:13.423201 543705 net.go:770] primary dev: ETH0
I0319 16:01:13.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:01:13.423229 543705 net.go:698] Add success.
I0319 16:01:14.454946 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:01:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:01:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 16:01:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:01:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 16:01:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:01:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:01:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:01:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:01:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:01:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:01:19.870492 543705 disk_info.go:125] begin check local disk info of client
I0319 16:01:19.872957 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:01:19.872964 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492340 0xc000492380]
E0319 16:01:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:23.409793 543705 memory.go:184] no items to output this cycle
I0319 16:01:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 16:01:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:33.409774 543705 memory.go:184] no items to output this cycle
I0319 16:01:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:01:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:43.409804 543705 memory.go:191] Add success.
I0319 16:01:43.409816 543705 cpu.go:282] Add success.
I0319 16:01:43.419891 543705 net.go:648] Add success.
I0319 16:01:43.422489 543705 net.go:770] primary dev: ETH0
I0319 16:01:43.422503 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:01:43.422517 543705 net.go:698] Add success.
I0319 16:01:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:01:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:01:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:01:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:01:53.409773 543705 memory.go:184] no items to output this cycle
I0319 16:01:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 16:02:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:03.409776 543705 memory.go:184] no items to output this cycle
I0319 16:02:03.409782 543705 cpu.go:275] no items to output this cycle
W0319 16:02:13.409718 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:02:13.409736 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:02:13.409741 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:02:13.409809 543705 cpu.go:282] Add success.
E0319 16:02:13.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:13.409863 543705 memory.go:191] Add success.
I0319 16:02:13.420179 543705 net.go:648] Add success.
I0319 16:02:13.422898 543705 net.go:770] primary dev: ETH0
I0319 16:02:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:02:13.422927 543705 net.go:698] Add success.
W0319 16:02:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:02:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 16:02:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:02:14.456184 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:02:14.456193 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:02:14.456200 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:02:14.456451 543705 disk_worker.go:494] system disk:vda1
I0319 16:02:14.456480 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:02:15.456890 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:02:15.456898 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:02:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:02:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:02:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:02:16.458020 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:02:16.472344 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:02:19.873673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:02:19.876036 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:02:19.876043 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee0c0 0xc0003ee100]
E0319 16:02:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:23.409762 543705 memory.go:184] no items to output this cycle
I0319 16:02:23.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:02:33.410279 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:33.410297 543705 memory.go:184] no items to output this cycle
I0319 16:02:33.410312 543705 cpu.go:275] no items to output this cycle
E0319 16:02:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:43.409799 543705 memory.go:191] Add success.
I0319 16:02:43.409804 543705 cpu.go:282] Add success.
I0319 16:02:43.420032 543705 net.go:648] Add success.
I0319 16:02:43.422962 543705 net.go:770] primary dev: ETH0
I0319 16:02:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:02:43.422988 543705 net.go:698] Add success.
I0319 16:02:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:02:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:02:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:02:53.409769 543705 memory.go:184] no items to output this cycle
I0319 16:02:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 16:03:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:03.409799 543705 memory.go:184] no items to output this cycle
I0319 16:03:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:03:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:13.409797 543705 memory.go:191] Add success.
I0319 16:03:13.409820 543705 cpu.go:282] Add success.
W0319 16:03:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:03:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:03:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:03:13.420202 543705 net.go:648] Add success.
I0319 16:03:13.422910 543705 net.go:770] primary dev: ETH0
I0319 16:03:13.422924 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:03:13.422939 543705 net.go:698] Add success.
I0319 16:03:13.726809 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c0bd437e-3483-4219-ad59-85c24d3f0a0c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:03:13.726846 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:03:14.454500 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:03:14.454748 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:03:14.454760 543705 disk_worker.go:708] disk space is not compliant
W0319 16:03:14.454762 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:03:14.456445 543705 disk_worker.go:494] system disk:vda1
I0319 16:03:14.456486 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:03:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:03:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:03:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:03:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:03:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:03:19.877675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:03:19.880056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:03:19.880061 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4200 0xc0000c4240]
E0319 16:03:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:23.409775 543705 memory.go:184] no items to output this cycle
I0319 16:03:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:03:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:33.409781 543705 memory.go:184] no items to output this cycle
I0319 16:03:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 16:03:37.759785 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:03:37.759792 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:03:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:43.410618 543705 memory.go:191] Add success.
I0319 16:03:43.409823 543705 cpu.go:282] Add success.
I0319 16:03:43.420387 543705 net.go:648] Add success.
I0319 16:03:43.423156 543705 net.go:770] primary dev: ETH0
I0319 16:03:43.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:03:43.423186 543705 net.go:698] Add success.
I0319 16:03:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:03:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:03:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:03:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:03:53.409777 543705 memory.go:184] no items to output this cycle
I0319 16:03:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:04:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:03.409774 543705 memory.go:184] no items to output this cycle
I0319 16:04:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:04:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:13.409791 543705 memory.go:191] Add success.
W0319 16:04:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:04:13.409822 543705 cpu.go:282] Add success.
W0319 16:04:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:04:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:04:13.420192 543705 net.go:648] Add success.
I0319 16:04:13.423201 543705 net.go:770] primary dev: ETH0
I0319 16:04:13.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:04:13.423225 543705 net.go:698] Add success.
I0319 16:04:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:04:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:04:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 16:04:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:04:14.456635 543705 disk_worker.go:494] system disk:vda1
I0319 16:04:14.456667 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:04:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:04:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:04:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:04:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:04:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:04:19.881680 543705 disk_info.go:125] begin check local disk info of client
I0319 16:04:19.884039 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:04:19.884045 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348f80 0xc000348fc0]
I0319 16:04:23.409870 543705 cpu.go:275] no items to output this cycle
E0319 16:04:23.409911 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:23.409928 543705 memory.go:184] no items to output this cycle
E0319 16:04:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:33.409810 543705 memory.go:184] no items to output this cycle
I0319 16:04:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 16:04:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:43.409827 543705 memory.go:191] Add success.
I0319 16:04:43.409837 543705 cpu.go:282] Add success.
I0319 16:04:43.420025 543705 net.go:648] Add success.
I0319 16:04:43.422750 543705 net.go:770] primary dev: ETH0
I0319 16:04:43.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:04:43.422777 543705 net.go:698] Add success.
I0319 16:04:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:04:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:04:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:04:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:04:53.409772 543705 memory.go:184] no items to output this cycle
I0319 16:04:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:05:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:03.409811 543705 memory.go:184] no items to output this cycle
I0319 16:05:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 16:05:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:13.409801 543705 memory.go:191] Add success.
I0319 16:05:13.409825 543705 cpu.go:282] Add success.
W0319 16:05:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:05:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:05:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:05:13.420161 543705 net.go:648] Add success.
I0319 16:05:13.423081 543705 net.go:770] primary dev: ETH0
I0319 16:05:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:05:13.423106 543705 net.go:698] Add success.
I0319 16:05:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:05:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:05:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 16:05:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:05:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 16:05:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:05:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:05:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:05:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:05:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:05:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:05:19.885672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:05:19.888068 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:05:19.888075 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047d500 0xc00047d540]
E0319 16:05:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:23.409899 543705 memory.go:184] no items to output this cycle
I0319 16:05:23.409928 543705 cpu.go:275] no items to output this cycle
E0319 16:05:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:33.409818 543705 memory.go:184] no items to output this cycle
I0319 16:05:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 16:05:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:43.409799 543705 memory.go:191] Add success.
I0319 16:05:43.409810 543705 cpu.go:282] Add success.
I0319 16:05:43.420056 543705 net.go:648] Add success.
I0319 16:05:43.423111 543705 net.go:770] primary dev: ETH0
I0319 16:05:43.423123 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:05:43.423136 543705 net.go:698] Add success.
I0319 16:05:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:05:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:05:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:05:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:05:53.409782 543705 memory.go:184] no items to output this cycle
I0319 16:05:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:06:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:03.409808 543705 memory.go:184] no items to output this cycle
I0319 16:06:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 16:06:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:13.409802 543705 memory.go:191] Add success.
W0319 16:06:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:06:13.409839 543705 cpu.go:282] Add success.
W0319 16:06:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:06:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:06:13.420142 543705 net.go:648] Add success.
I0319 16:06:13.423151 543705 net.go:770] primary dev: ETH0
I0319 16:06:13.423164 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:06:13.423175 543705 net.go:698] Add success.
I0319 16:06:13.464445 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0d1fbad2-9a76-4076-83f5-6f36baecbb78","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:06:13.464478 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:06:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:06:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:06:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 16:06:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:06:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 16:06:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:06:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:06:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:06:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:06:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:06:19.889672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:06:19.892103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:06:19.892109 543705 disk_info.go:196] parse disk info done, disk is : [0xc000286cc0 0xc000286d00]
E0319 16:06:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:23.409907 543705 memory.go:184] no items to output this cycle
I0319 16:06:23.409914 543705 cpu.go:275] no items to output this cycle
E0319 16:06:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:33.409806 543705 memory.go:184] no items to output this cycle
I0319 16:06:33.409831 543705 cpu.go:275] no items to output this cycle
I0319 16:06:37.761502 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:06:37.761509 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:06:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:43.410726 543705 memory.go:191] Add success.
I0319 16:06:43.409810 543705 cpu.go:282] Add success.
I0319 16:06:43.420464 543705 net.go:648] Add success.
I0319 16:06:43.423212 543705 net.go:770] primary dev: ETH0
I0319 16:06:43.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:06:43.423239 543705 net.go:698] Add success.
I0319 16:06:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:06:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:06:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:06:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:06:53.409779 543705 memory.go:184] no items to output this cycle
I0319 16:06:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:07:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:03.409777 543705 memory.go:184] no items to output this cycle
I0319 16:07:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:07:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:13.409791 543705 memory.go:191] Add success.
I0319 16:07:13.409819 543705 cpu.go:282] Add success.
W0319 16:07:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:07:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:07:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:07:13.420327 543705 net.go:648] Add success.
I0319 16:07:13.423325 543705 net.go:770] primary dev: ETH0
I0319 16:07:13.423340 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:07:13.423352 543705 net.go:698] Add success.
I0319 16:07:13.453035 543705 event_worker.go:152] Polling the log file for events...
W0319 16:07:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:07:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 16:07:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:07:14.456996 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:07:14.457006 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:07:14.457013 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:07:14.457055 543705 disk_worker.go:494] system disk:vda1
I0319 16:07:14.457084 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:07:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:07:15.456809 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:07:16.457900 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:07:16.457900 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:07:16.457953 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:07:16.457973 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:07:16.472271 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:07:19.893674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:07:19.896037 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:07:19.896043 543705 disk_info.go:196] parse disk info done, disk is : [0xc000358880 0xc0003588c0]
E0319 16:07:23.410208 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:23.410223 543705 memory.go:184] no items to output this cycle
I0319 16:07:23.410237 543705 cpu.go:275] no items to output this cycle
E0319 16:07:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:33.409790 543705 memory.go:184] no items to output this cycle
I0319 16:07:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 16:07:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:43.409775 543705 memory.go:191] Add success.
I0319 16:07:43.409818 543705 cpu.go:282] Add success.
I0319 16:07:43.420058 543705 net.go:648] Add success.
I0319 16:07:43.422910 543705 net.go:770] primary dev: ETH0
I0319 16:07:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:07:43.422948 543705 net.go:698] Add success.
I0319 16:07:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:07:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:07:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:07:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:07:53.409784 543705 memory.go:184] no items to output this cycle
I0319 16:07:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 16:08:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:03.409770 543705 memory.go:184] no items to output this cycle
I0319 16:08:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 16:08:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:13.409802 543705 memory.go:191] Add success.
I0319 16:08:13.409822 543705 cpu.go:282] Add success.
W0319 16:08:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:08:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:08:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:08:13.420253 543705 net.go:648] Add success.
I0319 16:08:13.422938 543705 net.go:770] primary dev: ETH0
I0319 16:08:13.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:08:13.422962 543705 net.go:698] Add success.
I0319 16:08:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:08:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:08:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 16:08:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:08:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 16:08:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:08:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:08:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:08:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:08:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:08:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:08:19.897673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:08:19.899988 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:08:19.899994 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035f540 0xc00035f580]
E0319 16:08:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:23.409754 543705 memory.go:184] no items to output this cycle
I0319 16:08:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:08:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:33.409808 543705 memory.go:184] no items to output this cycle
I0319 16:08:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 16:08:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:43.409804 543705 memory.go:191] Add success.
I0319 16:08:43.409828 543705 cpu.go:282] Add success.
I0319 16:08:43.420090 543705 net.go:648] Add success.
I0319 16:08:43.422633 543705 net.go:770] primary dev: ETH0
I0319 16:08:43.422659 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:08:43.422674 543705 net.go:698] Add success.
I0319 16:08:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:08:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:08:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:08:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:08:53.409800 543705 memory.go:184] no items to output this cycle
I0319 16:08:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:09:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:03.409781 543705 memory.go:184] no items to output this cycle
I0319 16:09:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 16:09:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:13.409838 543705 memory.go:191] Add success.
I0319 16:09:13.409846 543705 cpu.go:282] Add success.
W0319 16:09:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:09:13.409888 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:09:13.409892 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:09:13.420158 543705 net.go:648] Add success.
I0319 16:09:13.423175 543705 net.go:770] primary dev: ETH0
I0319 16:09:13.423191 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:09:13.423207 543705 net.go:698] Add success.
I0319 16:09:13.469092 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6bfe0ab6-61fd-4e86-b1d1-2e2ce07421da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:09:13.469127 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:09:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:09:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 16:09:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:09:14.456729 543705 disk_worker.go:494] system disk:vda1
I0319 16:09:14.456757 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:09:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:09:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:09:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:09:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:09:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:09:19.901201 543705 disk_info.go:125] begin check local disk info of client
I0319 16:09:19.903623 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:09:19.903629 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c700 0xc00034c740]
E0319 16:09:23.410764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:23.410778 543705 memory.go:184] no items to output this cycle
I0319 16:09:23.410785 543705 cpu.go:275] no items to output this cycle
E0319 16:09:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:33.409785 543705 memory.go:184] no items to output this cycle
I0319 16:09:33.409808 543705 cpu.go:275] no items to output this cycle
I0319 16:09:37.761728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:09:37.761744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:09:43.409895 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:43.410807 543705 memory.go:191] Add success.
I0319 16:09:43.409936 543705 cpu.go:282] Add success.
I0319 16:09:43.419731 543705 net.go:648] Add success.
I0319 16:09:43.423016 543705 net.go:770] primary dev: ETH0
I0319 16:09:43.423029 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:09:43.423041 543705 net.go:698] Add success.
I0319 16:09:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:09:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:09:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:09:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:09:53.409786 543705 memory.go:184] no items to output this cycle
I0319 16:09:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:10:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:03.409800 543705 memory.go:184] no items to output this cycle
I0319 16:10:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 16:10:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:13.409814 543705 memory.go:191] Add success.
I0319 16:10:13.409815 543705 cpu.go:282] Add success.
W0319 16:10:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:10:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:10:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:10:13.420333 543705 net.go:648] Add success.
I0319 16:10:13.423138 543705 net.go:770] primary dev: ETH0
I0319 16:10:13.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:10:13.423164 543705 net.go:698] Add success.
I0319 16:10:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:10:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:10:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 16:10:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:10:14.456511 543705 disk_worker.go:494] system disk:vda1
I0319 16:10:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:10:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:10:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:10:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:10:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:10:19.905149 543705 disk_info.go:125] begin check local disk info of client
I0319 16:10:19.907566 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:10:19.907572 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003aa040 0xc0003aa080]
E0319 16:10:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:23.409812 543705 memory.go:184] no items to output this cycle
I0319 16:10:23.409822 543705 cpu.go:275] no items to output this cycle
E0319 16:10:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:33.409797 543705 memory.go:184] no items to output this cycle
I0319 16:10:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:10:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:43.409796 543705 memory.go:191] Add success.
I0319 16:10:43.409815 543705 cpu.go:282] Add success.
I0319 16:10:43.420079 543705 net.go:648] Add success.
I0319 16:10:43.422604 543705 net.go:770] primary dev: ETH0
I0319 16:10:43.422617 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:10:43.422629 543705 net.go:698] Add success.
I0319 16:10:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:10:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:10:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:10:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:10:53.409782 543705 memory.go:184] no items to output this cycle
I0319 16:10:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:11:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:03.409778 543705 memory.go:184] no items to output this cycle
I0319 16:11:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:11:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:13.409804 543705 memory.go:191] Add success.
I0319 16:11:13.409804 543705 cpu.go:282] Add success.
W0319 16:11:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:11:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:11:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:11:13.420125 543705 net.go:648] Add success.
I0319 16:11:13.422942 543705 net.go:770] primary dev: ETH0
I0319 16:11:13.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:11:13.422967 543705 net.go:698] Add success.
I0319 16:11:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:11:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:11:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 16:11:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:11:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 16:11:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:11:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:11:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:11:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:11:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:11:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:11:19.909227 543705 disk_info.go:125] begin check local disk info of client
I0319 16:11:19.911632 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:11:19.911638 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7dc0 0xc0003b7e00]
E0319 16:11:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:23.409800 543705 memory.go:184] no items to output this cycle
I0319 16:11:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:11:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:33.409770 543705 memory.go:184] no items to output this cycle
I0319 16:11:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:11:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:43.409830 543705 memory.go:191] Add success.
I0319 16:11:43.409837 543705 cpu.go:282] Add success.
I0319 16:11:43.420008 543705 net.go:648] Add success.
I0319 16:11:43.422645 543705 net.go:770] primary dev: ETH0
I0319 16:11:43.422659 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:11:43.422671 543705 net.go:698] Add success.
I0319 16:11:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:11:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:11:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:11:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:11:53.409764 543705 memory.go:184] no items to output this cycle
I0319 16:11:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:12:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:03.409795 543705 memory.go:184] no items to output this cycle
I0319 16:12:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:12:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:13.409824 543705 memory.go:191] Add success.
I0319 16:12:13.409834 543705 cpu.go:282] Add success.
W0319 16:12:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:12:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:12:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:12:13.420233 543705 net.go:648] Add success.
I0319 16:12:13.422900 543705 net.go:770] primary dev: ETH0
I0319 16:12:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:12:13.422925 543705 net.go:698] Add success.
I0319 16:12:13.463465 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"84746cdd-7652-414d-b89f-e455bbc50cdb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:12:13.463499 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 16:12:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:12:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 16:12:14.455199 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:12:14.455951 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:12:14.455959 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:12:14.455964 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:12:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 16:12:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:12:15.456804 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:12:15.456812 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:12:16.457937 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:12:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:12:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:12:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:12:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:12:19.913195 543705 disk_info.go:125] begin check local disk info of client
I0319 16:12:19.915577 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:12:19.915583 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314ec0 0xc000314f00]
E0319 16:12:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:23.409813 543705 memory.go:184] no items to output this cycle
I0319 16:12:23.409826 543705 cpu.go:275] no items to output this cycle
E0319 16:12:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:33.409794 543705 memory.go:184] no items to output this cycle
I0319 16:12:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 16:12:37.763514 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:12:37.763521 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:12:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:43.410616 543705 memory.go:191] Add success.
I0319 16:12:43.409792 543705 cpu.go:282] Add success.
I0319 16:12:43.420353 543705 net.go:648] Add success.
I0319 16:12:43.422926 543705 net.go:770] primary dev: ETH0
I0319 16:12:43.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:12:43.422951 543705 net.go:698] Add success.
I0319 16:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:12:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:12:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:12:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:12:53.409780 543705 cpu.go:275] no items to output this cycle
I0319 16:12:53.409782 543705 memory.go:184] no items to output this cycle
E0319 16:13:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:03.409796 543705 memory.go:184] no items to output this cycle
I0319 16:13:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:13:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:13.409814 543705 memory.go:191] Add success.
I0319 16:13:13.409817 543705 cpu.go:282] Add success.
W0319 16:13:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:13:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:13:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:13:13.420168 543705 net.go:648] Add success.
I0319 16:13:13.422849 543705 net.go:770] primary dev: ETH0
I0319 16:13:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:13:13.422875 543705 net.go:698] Add success.
I0319 16:13:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:13:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:13:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 16:13:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:13:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 16:13:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:13:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:13:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:13:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:13:16.472415 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:13:19.917207 543705 disk_info.go:125] begin check local disk info of client
I0319 16:13:19.919661 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:13:19.919666 543705 disk_info.go:196] parse disk info done, disk is : [0xc000285400 0xc000285440]
E0319 16:13:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:23.409781 543705 memory.go:184] no items to output this cycle
I0319 16:13:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 16:13:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:33.409771 543705 memory.go:184] no items to output this cycle
I0319 16:13:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:13:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:43.409792 543705 memory.go:191] Add success.
I0319 16:13:43.409797 543705 cpu.go:282] Add success.
I0319 16:13:43.419918 543705 net.go:648] Add success.
I0319 16:13:43.422534 543705 net.go:770] primary dev: ETH0
I0319 16:13:43.422548 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:13:43.422560 543705 net.go:698] Add success.
I0319 16:13:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:13:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:13:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:13:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:13:53.409784 543705 memory.go:184] no items to output this cycle
I0319 16:13:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:14:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:03.409778 543705 memory.go:184] no items to output this cycle
I0319 16:14:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 16:14:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:13.409801 543705 memory.go:191] Add success.
I0319 16:14:13.409821 543705 cpu.go:282] Add success.
W0319 16:14:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:14:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:14:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:14:13.420247 543705 net.go:648] Add success.
I0319 16:14:13.422892 543705 net.go:770] primary dev: ETH0
I0319 16:14:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:14:13.422915 543705 net.go:698] Add success.
I0319 16:14:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:14:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:14:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0319 16:14:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:14:14.456629 543705 disk_worker.go:494] system disk:vda1
I0319 16:14:14.456661 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:14:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:14:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:14:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:14:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:14:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:14:19.921677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:14:19.924103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:14:19.924110 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256c80 0xc000256cc0]
E0319 16:14:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:23.409800 543705 memory.go:184] no items to output this cycle
I0319 16:14:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:14:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:33.409776 543705 memory.go:184] no items to output this cycle
I0319 16:14:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 16:14:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:43.409793 543705 memory.go:191] Add success.
I0319 16:14:43.409810 543705 cpu.go:282] Add success.
I0319 16:14:43.420015 543705 net.go:648] Add success.
I0319 16:14:43.422761 543705 net.go:770] primary dev: ETH0
I0319 16:14:43.422775 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:14:43.422787 543705 net.go:698] Add success.
I0319 16:14:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:14:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:14:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:14:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:14:53.409777 543705 memory.go:184] no items to output this cycle
I0319 16:14:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 16:15:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:03.409798 543705 memory.go:184] no items to output this cycle
I0319 16:15:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 16:15:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:13.409817 543705 memory.go:191] Add success.
I0319 16:15:13.409816 543705 cpu.go:282] Add success.
W0319 16:15:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:15:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:15:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:15:13.420148 543705 net.go:648] Add success.
I0319 16:15:13.422752 543705 net.go:770] primary dev: ETH0
I0319 16:15:13.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:15:13.422782 543705 net.go:698] Add success.
I0319 16:15:13.463802 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c7f799ab-845e-48a5-b75b-ba2d4b9a1f60","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:15:13.463835 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:15:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:15:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:15:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 16:15:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:15:14.456555 543705 disk_worker.go:494] system disk:vda1
I0319 16:15:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:15:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:15:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:15:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:15:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:15:16.472473 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:15:19.925677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:15:19.928124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:15:19.928131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a42c0 0xc0002a4300]
E0319 16:15:23.410365 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:23.410384 543705 memory.go:184] no items to output this cycle
I0319 16:15:23.410446 543705 cpu.go:275] no items to output this cycle
E0319 16:15:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:33.409780 543705 memory.go:184] no items to output this cycle
I0319 16:15:33.409791 543705 cpu.go:275] no items to output this cycle
I0319 16:15:37.764523 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:15:37.764530 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:15:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:43.410664 543705 memory.go:191] Add success.
I0319 16:15:43.409805 543705 cpu.go:282] Add success.
I0319 16:15:43.420367 543705 net.go:648] Add success.
I0319 16:15:43.423094 543705 net.go:770] primary dev: ETH0
I0319 16:15:43.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:15:43.423122 543705 net.go:698] Add success.
I0319 16:15:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:15:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:15:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:15:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:15:53.409787 543705 memory.go:184] no items to output this cycle
I0319 16:15:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 16:16:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:03.409797 543705 memory.go:184] no items to output this cycle
I0319 16:16:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 16:16:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:13.409831 543705 memory.go:191] Add success.
I0319 16:16:13.409840 543705 cpu.go:282] Add success.
W0319 16:16:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:16:13.413003 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:16:13.413008 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:16:13.420662 543705 net.go:648] Add success.
I0319 16:16:13.422418 543705 net.go:770] primary dev: ETH0
I0319 16:16:13.422439 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:16:13.422453 543705 net.go:698] Add success.
I0319 16:16:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:16:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:16:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0319 16:16:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:16:14.456638 543705 disk_worker.go:494] system disk:vda1
I0319 16:16:14.456672 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:16:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:16:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:16:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:16:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:16:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:16:19.929673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:16:19.932174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:16:19.932181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e2dc0 0xc0001e2e00]
E0319 16:16:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:23.409764 543705 memory.go:184] no items to output this cycle
I0319 16:16:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 16:16:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:33.409810 543705 memory.go:184] no items to output this cycle
I0319 16:16:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 16:16:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:43.409807 543705 memory.go:191] Add success.
I0319 16:16:43.409808 543705 cpu.go:282] Add success.
I0319 16:16:43.419968 543705 net.go:648] Add success.
I0319 16:16:43.423093 543705 net.go:770] primary dev: ETH0
I0319 16:16:43.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:16:43.423122 543705 net.go:698] Add success.
I0319 16:16:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:16:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:16:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:16:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:16:53.409769 543705 memory.go:184] no items to output this cycle
I0319 16:16:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 16:17:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:03.409778 543705 memory.go:184] no items to output this cycle
I0319 16:17:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:17:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:13.409790 543705 memory.go:191] Add success.
W0319 16:17:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:17:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:17:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:17:13.409872 543705 cpu.go:282] Add success.
I0319 16:17:13.420371 543705 net.go:648] Add success.
I0319 16:17:13.423285 543705 net.go:770] primary dev: ETH0
I0319 16:17:13.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:17:13.423311 543705 net.go:698] Add success.
I0319 16:17:13.452836 543705 event_worker.go:152] Polling the log file for events...
W0319 16:17:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:17:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 16:17:14.455215 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:17:14.456025 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:17:14.456035 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:17:14.456041 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:17:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 16:17:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:17:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:17:15.456834 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:17:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:17:16.457991 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:17:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:17:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:17:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:17:19.933677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:17:19.936139 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:17:19.936149 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028fe80 0xc00028fec0]
E0319 16:17:23.409834 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:23.409853 543705 memory.go:184] no items to output this cycle
I0319 16:17:23.409994 543705 cpu.go:275] no items to output this cycle
E0319 16:17:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:33.409768 543705 memory.go:184] no items to output this cycle
I0319 16:17:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:17:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:43.409789 543705 memory.go:191] Add success.
I0319 16:17:43.409796 543705 cpu.go:282] Add success.
I0319 16:17:43.419837 543705 net.go:648] Add success.
I0319 16:17:43.422635 543705 net.go:770] primary dev: ETH0
I0319 16:17:43.422648 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:17:43.422662 543705 net.go:698] Add success.
I0319 16:17:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:17:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:17:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:17:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:17:53.409793 543705 memory.go:184] no items to output this cycle
I0319 16:17:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:18:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:03.409775 543705 memory.go:184] no items to output this cycle
I0319 16:18:03.409779 543705 cpu.go:275] no items to output this cycle
E0319 16:18:13.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:13.409830 543705 cpu.go:282] Add success.
I0319 16:18:13.409839 543705 memory.go:191] Add success.
W0319 16:18:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:18:13.409892 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:18:13.409896 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:18:13.420310 543705 net.go:648] Add success.
I0319 16:18:13.423359 543705 net.go:770] primary dev: ETH0
I0319 16:18:13.423373 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:18:13.423386 543705 net.go:698] Add success.
I0319 16:18:13.463655 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"15fdc906-51ae-4e93-9965-f220ab77f6fc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:18:13.463690 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:18:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:18:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 16:18:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:18:14.456648 543705 disk_worker.go:494] system disk:vda1
I0319 16:18:14.456681 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:18:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:18:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:18:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:18:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:18:16.472447 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:18:19.937672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:18:19.940144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:18:19.940150 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047c500 0xc00047c540]
E0319 16:18:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:23.409776 543705 memory.go:184] no items to output this cycle
I0319 16:18:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 16:18:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:33.409799 543705 memory.go:184] no items to output this cycle
I0319 16:18:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 16:18:37.765524 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:18:37.765529 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:18:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:43.410685 543705 memory.go:191] Add success.
I0319 16:18:43.409808 543705 cpu.go:282] Add success.
I0319 16:18:43.420381 543705 net.go:648] Add success.
I0319 16:18:43.423219 543705 net.go:770] primary dev: ETH0
I0319 16:18:43.423232 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:18:43.423246 543705 net.go:698] Add success.
I0319 16:18:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:18:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:18:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:18:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:18:53.409769 543705 memory.go:184] no items to output this cycle
I0319 16:18:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:19:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:03.409804 543705 memory.go:184] no items to output this cycle
I0319 16:19:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 16:19:13.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:13.409816 543705 cpu.go:282] Add success.
I0319 16:19:13.409843 543705 memory.go:191] Add success.
W0319 16:19:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:19:13.409896 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:19:13.409900 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:19:13.420413 543705 net.go:648] Add success.
I0319 16:19:13.423650 543705 net.go:770] primary dev: ETH0
I0319 16:19:13.423664 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:19:13.423678 543705 net.go:698] Add success.
I0319 16:19:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:19:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:19:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 16:19:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:19:14.456609 543705 disk_worker.go:494] system disk:vda1
I0319 16:19:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:19:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:19:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:19:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:19:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:19:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:19:19.941671 543705 disk_info.go:125] begin check local disk info of client
I0319 16:19:19.944166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:19:19.944173 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e2340 0xc0003e2380]
E0319 16:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:23.409795 543705 memory.go:184] no items to output this cycle
I0319 16:19:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:19:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:33.409797 543705 memory.go:184] no items to output this cycle
I0319 16:19:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:19:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:43.409816 543705 memory.go:191] Add success.
I0319 16:19:43.409821 543705 cpu.go:282] Add success.
I0319 16:19:43.419863 543705 net.go:648] Add success.
I0319 16:19:43.422943 543705 net.go:770] primary dev: ETH0
I0319 16:19:43.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:19:43.422972 543705 net.go:698] Add success.
I0319 16:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:19:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:19:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:19:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:19:53.409774 543705 memory.go:184] no items to output this cycle
I0319 16:19:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:20:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:03.409772 543705 memory.go:184] no items to output this cycle
I0319 16:20:03.409778 543705 cpu.go:275] no items to output this cycle
E0319 16:20:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:13.409782 543705 memory.go:191] Add success.
W0319 16:20:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:20:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:20:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:20:13.409863 543705 cpu.go:282] Add success.
I0319 16:20:13.420491 543705 net.go:648] Add success.
I0319 16:20:13.423368 543705 net.go:770] primary dev: ETH0
I0319 16:20:13.423383 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:20:13.423395 543705 net.go:698] Add success.
I0319 16:20:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:20:14.455219 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:20:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0319 16:20:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:20:14.456629 543705 disk_worker.go:494] system disk:vda1
I0319 16:20:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:20:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:20:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:20:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:20:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:20:16.472447 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:20:19.945672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:20:19.948116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:20:19.948122 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370480 0xc0003704c0]
E0319 16:20:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:23.409775 543705 cpu.go:275] no items to output this cycle
I0319 16:20:23.409784 543705 memory.go:184] no items to output this cycle
E0319 16:20:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:33.409806 543705 memory.go:184] no items to output this cycle
I0319 16:20:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 16:20:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:43.409786 543705 memory.go:191] Add success.
I0319 16:20:43.409806 543705 cpu.go:282] Add success.
I0319 16:20:43.420002 543705 net.go:648] Add success.
I0319 16:20:43.422761 543705 net.go:770] primary dev: ETH0
I0319 16:20:43.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:20:43.422785 543705 net.go:698] Add success.
I0319 16:20:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:20:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:20:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:20:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:20:53.409772 543705 memory.go:184] no items to output this cycle
I0319 16:20:53.409788 543705 cpu.go:275] no items to output this cycle
I0319 16:21:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:21:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:03.409804 543705 memory.go:184] no items to output this cycle
E0319 16:21:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:13.409822 543705 memory.go:191] Add success.
I0319 16:21:13.409830 543705 cpu.go:282] Add success.
W0319 16:21:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:21:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:21:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:21:13.420217 543705 net.go:648] Add success.
I0319 16:21:13.422862 543705 net.go:770] primary dev: ETH0
I0319 16:21:13.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:21:13.422887 543705 net.go:698] Add success.
I0319 16:21:13.469213 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9a4a10c0-920d-485f-9284-7e7e18554ec1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:21:13.469244 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:21:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:21:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:21:14.455242 543705 disk_worker.go:708] disk space is not compliant
W0319 16:21:14.455246 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:21:14.456661 543705 disk_worker.go:494] system disk:vda1
I0319 16:21:14.456697 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:21:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:21:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:21:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:21:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:21:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:21:19.949672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:21:19.952034 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:21:19.952040 543705 disk_info.go:196] parse disk info done, disk is : [0xc000354300 0xc000354340]
E0319 16:21:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:23.409796 543705 memory.go:184] no items to output this cycle
I0319 16:21:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:21:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:33.409770 543705 memory.go:184] no items to output this cycle
I0319 16:21:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 16:21:37.765733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:21:37.765740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:21:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:43.410708 543705 memory.go:191] Add success.
I0319 16:21:43.409825 543705 cpu.go:282] Add success.
I0319 16:21:43.420549 543705 net.go:648] Add success.
I0319 16:21:43.423462 543705 net.go:770] primary dev: ETH0
I0319 16:21:43.423480 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:21:43.423497 543705 net.go:698] Add success.
I0319 16:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:21:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:21:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:21:53.410280 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:21:53.410303 543705 memory.go:184] no items to output this cycle
I0319 16:21:53.410307 543705 cpu.go:275] no items to output this cycle
E0319 16:22:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:03.409776 543705 memory.go:184] no items to output this cycle
I0319 16:22:03.409781 543705 cpu.go:275] no items to output this cycle
I0319 16:22:13.409805 543705 cpu.go:282] Add success.
E0319 16:22:13.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:13.409826 543705 memory.go:191] Add success.
W0319 16:22:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:22:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:22:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:22:13.420749 543705 net.go:648] Add success.
I0319 16:22:13.423799 543705 net.go:770] primary dev: ETH0
I0319 16:22:13.423812 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:22:13.423823 543705 net.go:698] Add success.
W0319 16:22:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:22:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0319 16:22:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:22:14.456838 543705 disk_worker.go:494] system disk:vda1
I0319 16:22:14.456881 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:22:14.457095 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:22:14.457104 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:22:14.457110 543705 custom_config.go:64] query custom config with name: gpu
E0319 16:22:15.456876 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:22:15.456887 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:22:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:22:16.457950 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:22:16.457994 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:22:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:22:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:22:19.953671 543705 disk_info.go:125] begin check local disk info of client
I0319 16:22:19.956014 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:22:19.956020 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003584c0 0xc000358500]
E0319 16:22:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:23.409763 543705 memory.go:184] no items to output this cycle
I0319 16:22:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 16:22:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:33.409799 543705 memory.go:184] no items to output this cycle
I0319 16:22:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 16:22:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:43.409804 543705 memory.go:191] Add success.
I0319 16:22:43.409805 543705 cpu.go:282] Add success.
I0319 16:22:43.420079 543705 net.go:648] Add success.
I0319 16:22:43.422946 543705 net.go:770] primary dev: ETH0
I0319 16:22:43.422960 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:22:43.422972 543705 net.go:698] Add success.
I0319 16:22:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:22:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:22:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:22:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:22:53.409774 543705 memory.go:184] no items to output this cycle
I0319 16:22:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:23:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:03.409808 543705 memory.go:184] no items to output this cycle
I0319 16:23:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 16:23:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:13.409797 543705 memory.go:191] Add success.
I0319 16:23:13.409818 543705 cpu.go:282] Add success.
W0319 16:23:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:23:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:23:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:23:13.420759 543705 net.go:648] Add success.
I0319 16:23:13.424260 543705 net.go:770] primary dev: ETH0
I0319 16:23:13.424273 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:23:13.424286 543705 net.go:698] Add success.
I0319 16:23:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:23:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:23:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0319 16:23:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:23:14.456615 543705 disk_worker.go:494] system disk:vda1
I0319 16:23:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:23:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:23:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:23:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:23:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:23:16.472365 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:23:19.957674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:23:19.960157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:23:19.960163 543705 disk_info.go:196] parse disk info done, disk is : [0xc000499800 0xc000499840]
E0319 16:23:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:23.409811 543705 memory.go:184] no items to output this cycle
I0319 16:23:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 16:23:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:33.409807 543705 memory.go:184] no items to output this cycle
I0319 16:23:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 16:23:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:43.409784 543705 memory.go:191] Add success.
I0319 16:23:43.409816 543705 cpu.go:282] Add success.
I0319 16:23:43.419868 543705 net.go:648] Add success.
I0319 16:23:43.422856 543705 net.go:770] primary dev: ETH0
I0319 16:23:43.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:23:43.422881 543705 net.go:698] Add success.
I0319 16:23:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:23:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:23:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:23:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:23:53.409778 543705 memory.go:184] no items to output this cycle
I0319 16:23:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:24:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:03.409767 543705 memory.go:184] no items to output this cycle
I0319 16:24:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:24:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:13.409806 543705 memory.go:191] Add success.
I0319 16:24:13.409810 543705 cpu.go:282] Add success.
W0319 16:24:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:24:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:24:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:24:13.420122 543705 net.go:648] Add success.
I0319 16:24:13.423136 543705 net.go:770] primary dev: ETH0
I0319 16:24:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:24:13.423162 543705 net.go:698] Add success.
I0319 16:24:13.469026 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98952ef8-f59c-40b6-a217-ff8ded3ff2ad","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:24:13.469057 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:24:14.454994 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:24:14.455141 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:24:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 16:24:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:24:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 16:24:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:24:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:24:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:24:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:24:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:24:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:24:19.961669 543705 disk_info.go:125] begin check local disk info of client
I0319 16:24:19.964014 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:24:19.964020 543705 disk_info.go:196] parse disk info done, disk is : [0xc000463a00 0xc000463a40]
E0319 16:24:23.410239 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:23.410255 543705 memory.go:184] no items to output this cycle
I0319 16:24:23.410271 543705 cpu.go:275] no items to output this cycle
E0319 16:24:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:33.409779 543705 memory.go:184] no items to output this cycle
I0319 16:24:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 16:24:37.767528 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:24:37.767535 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:24:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:43.410839 543705 memory.go:191] Add success.
I0319 16:24:43.409829 543705 cpu.go:282] Add success.
I0319 16:24:43.420556 543705 net.go:648] Add success.
I0319 16:24:43.423230 543705 net.go:770] primary dev: ETH0
I0319 16:24:43.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:24:43.423256 543705 net.go:698] Add success.
I0319 16:24:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:24:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:24:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:24:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:24:53.409776 543705 cpu.go:275] no items to output this cycle
I0319 16:24:53.409780 543705 memory.go:184] no items to output this cycle
E0319 16:25:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:03.409794 543705 memory.go:184] no items to output this cycle
I0319 16:25:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 16:25:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:13.409816 543705 memory.go:191] Add success.
I0319 16:25:13.409826 543705 cpu.go:282] Add success.
W0319 16:25:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:25:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:25:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:25:13.420209 543705 net.go:648] Add success.
I0319 16:25:13.422901 543705 net.go:770] primary dev: ETH0
I0319 16:25:13.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:25:13.422933 543705 net.go:698] Add success.
I0319 16:25:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:25:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:25:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0319 16:25:14.455241 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:25:14.456671 543705 disk_worker.go:494] system disk:vda1
I0319 16:25:14.456705 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:25:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:25:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:25:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:25:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:25:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:25:19.965675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:25:19.968064 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:25:19.968070 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a2c0 0xc00036a300]
E0319 16:25:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:23.409791 543705 memory.go:184] no items to output this cycle
I0319 16:25:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:25:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:33.409778 543705 memory.go:184] no items to output this cycle
I0319 16:25:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:25:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:43.409786 543705 memory.go:191] Add success.
I0319 16:25:43.409785 543705 cpu.go:282] Add success.
I0319 16:25:43.419964 543705 net.go:648] Add success.
I0319 16:25:43.422800 543705 net.go:770] primary dev: ETH0
I0319 16:25:43.422813 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:25:43.422827 543705 net.go:698] Add success.
I0319 16:25:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:25:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:25:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:25:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:25:53.409782 543705 memory.go:184] no items to output this cycle
I0319 16:25:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 16:26:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:03.409798 543705 memory.go:184] no items to output this cycle
I0319 16:26:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:26:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:13.409791 543705 memory.go:191] Add success.
I0319 16:26:13.409809 543705 cpu.go:282] Add success.
W0319 16:26:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:26:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:26:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:26:13.420117 543705 net.go:648] Add success.
I0319 16:26:13.422801 543705 net.go:770] primary dev: ETH0
I0319 16:26:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:26:13.422834 543705 net.go:698] Add success.
I0319 16:26:14.454986 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:26:14.455217 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:26:14.455229 543705 disk_worker.go:708] disk space is not compliant
W0319 16:26:14.455232 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:26:14.456616 543705 disk_worker.go:494] system disk:vda1
I0319 16:26:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:26:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:26:16.458150 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:26:16.458176 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:26:16.472091 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:26:19.969677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:26:19.972119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:26:19.972125 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462080 0xc0004620c0]
E0319 16:26:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:23.409780 543705 memory.go:184] no items to output this cycle
I0319 16:26:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:26:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:33.409786 543705 cpu.go:275] no items to output this cycle
I0319 16:26:33.409789 543705 memory.go:184] no items to output this cycle
E0319 16:26:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:43.409799 543705 memory.go:191] Add success.
I0319 16:26:43.409800 543705 cpu.go:282] Add success.
I0319 16:26:43.419886 543705 net.go:648] Add success.
I0319 16:26:43.422706 543705 net.go:770] primary dev: ETH0
I0319 16:26:43.422737 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:26:43.422753 543705 net.go:698] Add success.
I0319 16:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:26:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:26:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:26:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:26:53.409766 543705 memory.go:184] no items to output this cycle
I0319 16:26:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:27:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:03.409781 543705 memory.go:184] no items to output this cycle
I0319 16:27:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 16:27:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:13.409792 543705 memory.go:191] Add success.
I0319 16:27:13.409792 543705 cpu.go:282] Add success.
W0319 16:27:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:27:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:27:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:27:13.420044 543705 net.go:648] Add success.
I0319 16:27:13.422869 543705 net.go:770] primary dev: ETH0
I0319 16:27:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:27:13.422895 543705 net.go:698] Add success.
I0319 16:27:13.429580 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 16:27:13.452769 543705 event_worker.go:152] Polling the log file for events...
I0319 16:27:13.462949 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"065f536e-6886-40dd-b622-425b8ad179b2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:27:13.462982 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 16:27:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:27:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 16:27:14.455211 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:27:14.456335 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:27:14.456347 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:27:14.456354 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:27:14.456625 543705 disk_worker.go:494] system disk:vda1
I0319 16:27:14.456678 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:27:15.456835 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:27:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 16:27:16.457973 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:27:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:27:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:27:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:27:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:27:19.973669 543705 disk_info.go:125] begin check local disk info of client
I0319 16:27:19.976037 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:27:19.976043 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047ce00 0xc00047ce40]
E0319 16:27:23.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:23.410012 543705 memory.go:184] no items to output this cycle
I0319 16:27:23.410014 543705 cpu.go:275] no items to output this cycle
E0319 16:27:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:33.409779 543705 memory.go:184] no items to output this cycle
I0319 16:27:33.409792 543705 cpu.go:275] no items to output this cycle
I0319 16:27:37.768545 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:27:37.768551 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:27:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:43.410716 543705 memory.go:191] Add success.
I0319 16:27:43.409790 543705 cpu.go:282] Add success.
I0319 16:27:43.420419 543705 net.go:648] Add success.
I0319 16:27:43.423077 543705 net.go:770] primary dev: ETH0
I0319 16:27:43.423091 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:27:43.423106 543705 net.go:698] Add success.
I0319 16:27:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:27:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:27:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:27:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:27:53.409801 543705 memory.go:184] no items to output this cycle
I0319 16:27:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 16:28:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:03.409771 543705 memory.go:184] no items to output this cycle
I0319 16:28:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:28:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:13.409803 543705 memory.go:191] Add success.
I0319 16:28:13.409817 543705 cpu.go:282] Add success.
W0319 16:28:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:28:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:28:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:28:13.420196 543705 net.go:648] Add success.
I0319 16:28:13.422815 543705 net.go:770] primary dev: ETH0
I0319 16:28:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:28:13.422844 543705 net.go:698] Add success.
I0319 16:28:14.454992 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:28:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:28:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 16:28:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:28:14.456634 543705 disk_worker.go:494] system disk:vda1
I0319 16:28:14.456669 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:28:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:28:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:28:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:28:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:28:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:28:19.977677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:28:19.980088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:28:19.980094 543705 disk_info.go:196] parse disk info done, disk is : [0xc000286340 0xc000286380]
E0319 16:28:23.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:23.409903 543705 memory.go:184] no items to output this cycle
I0319 16:28:23.409920 543705 cpu.go:275] no items to output this cycle
E0319 16:28:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:33.409776 543705 memory.go:184] no items to output this cycle
I0319 16:28:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 16:28:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:43.409826 543705 memory.go:191] Add success.
I0319 16:28:43.409828 543705 cpu.go:282] Add success.
I0319 16:28:43.420011 543705 net.go:648] Add success.
I0319 16:28:43.422984 543705 net.go:770] primary dev: ETH0
I0319 16:28:43.422999 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:28:43.423014 543705 net.go:698] Add success.
I0319 16:28:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:28:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:28:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:28:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:28:53.409781 543705 cpu.go:275] no items to output this cycle
I0319 16:28:53.409784 543705 memory.go:184] no items to output this cycle
E0319 16:29:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:03.409761 543705 memory.go:184] no items to output this cycle
I0319 16:29:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 16:29:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:13.409798 543705 memory.go:191] Add success.
I0319 16:29:13.409799 543705 cpu.go:282] Add success.
W0319 16:29:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:29:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:29:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:29:13.420120 543705 net.go:648] Add success.
I0319 16:29:13.423010 543705 net.go:770] primary dev: ETH0
I0319 16:29:13.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:29:13.423036 543705 net.go:698] Add success.
I0319 16:29:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:29:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:29:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 16:29:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:29:14.456664 543705 disk_worker.go:494] system disk:vda1
I0319 16:29:14.456701 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:29:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:29:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:29:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:29:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:29:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:29:19.981673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:29:19.984110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:29:19.984116 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278c00 0xc000278c40]
E0319 16:29:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:23.409901 543705 cpu.go:275] no items to output this cycle
I0319 16:29:23.409904 543705 memory.go:184] no items to output this cycle
E0319 16:29:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:33.409805 543705 memory.go:184] no items to output this cycle
I0319 16:29:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 16:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:43.409809 543705 memory.go:191] Add success.
I0319 16:29:43.409812 543705 cpu.go:282] Add success.
I0319 16:29:43.419911 543705 net.go:648] Add success.
I0319 16:29:43.422803 543705 net.go:770] primary dev: ETH0
I0319 16:29:43.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:29:43.422828 543705 net.go:698] Add success.
I0319 16:29:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:29:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:29:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:29:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:29:53.409799 543705 memory.go:184] no items to output this cycle
I0319 16:29:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:30:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:03.409769 543705 memory.go:184] no items to output this cycle
I0319 16:30:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:30:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:13.409794 543705 memory.go:191] Add success.
I0319 16:30:13.409796 543705 cpu.go:282] Add success.
W0319 16:30:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:30:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:30:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:30:13.420162 543705 net.go:648] Add success.
I0319 16:30:13.423135 543705 net.go:770] primary dev: ETH0
I0319 16:30:13.423148 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:30:13.423160 543705 net.go:698] Add success.
I0319 16:30:13.464401 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4964824d-67d4-421e-8133-4ccb33eeaaaa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:30:13.464437 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:30:14.454989 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:30:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:30:14.455240 543705 disk_worker.go:708] disk space is not compliant
W0319 16:30:14.455245 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:30:14.456679 543705 disk_worker.go:494] system disk:vda1
I0319 16:30:14.456714 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:30:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:30:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:30:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:30:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:30:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:30:19.985672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:30:19.988065 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:30:19.988071 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032ed00 0xc00032ed40]
E0319 16:30:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:23.409793 543705 memory.go:184] no items to output this cycle
I0319 16:30:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:30:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:33.409877 543705 memory.go:184] no items to output this cycle
I0319 16:30:33.409917 543705 cpu.go:275] no items to output this cycle
I0319 16:30:37.769543 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:30:37.769549 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:30:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:43.410781 543705 memory.go:191] Add success.
I0319 16:30:43.409847 543705 cpu.go:282] Add success.
I0319 16:30:43.420497 543705 net.go:648] Add success.
I0319 16:30:43.423547 543705 net.go:770] primary dev: ETH0
I0319 16:30:43.423562 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:30:43.423577 543705 net.go:698] Add success.
I0319 16:30:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:30:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:30:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:30:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:30:53.409770 543705 memory.go:184] no items to output this cycle
I0319 16:30:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 16:31:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:03.409796 543705 memory.go:184] no items to output this cycle
I0319 16:31:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 16:31:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:13.409790 543705 memory.go:191] Add success.
I0319 16:31:13.409814 543705 cpu.go:282] Add success.
W0319 16:31:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:31:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:31:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:31:13.420105 543705 net.go:648] Add success.
I0319 16:31:13.423090 543705 net.go:770] primary dev: ETH0
I0319 16:31:13.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:31:13.423116 543705 net.go:698] Add success.
I0319 16:31:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:31:14.455241 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:31:14.455254 543705 disk_worker.go:708] disk space is not compliant
W0319 16:31:14.455257 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:31:14.456696 543705 disk_worker.go:494] system disk:vda1
I0319 16:31:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:31:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:31:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:31:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:31:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:31:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:31:19.989678 543705 disk_info.go:125] begin check local disk info of client
I0319 16:31:19.992065 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:31:19.992071 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6980 0xc0001c69c0]
E0319 16:31:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:23.409801 543705 memory.go:184] no items to output this cycle
I0319 16:31:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 16:31:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:33.409783 543705 memory.go:184] no items to output this cycle
I0319 16:31:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 16:31:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:43.409806 543705 memory.go:191] Add success.
I0319 16:31:43.409809 543705 cpu.go:282] Add success.
I0319 16:31:43.420044 543705 net.go:648] Add success.
I0319 16:31:43.423271 543705 net.go:770] primary dev: ETH0
I0319 16:31:43.423285 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:31:43.423297 543705 net.go:698] Add success.
I0319 16:31:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:31:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:31:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:31:53.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:31:53.409817 543705 memory.go:184] no items to output this cycle
I0319 16:31:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 16:32:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:03.409783 543705 memory.go:184] no items to output this cycle
I0319 16:32:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:32:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:13.409790 543705 memory.go:191] Add success.
I0319 16:32:13.409808 543705 cpu.go:282] Add success.
W0319 16:32:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:32:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:32:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:32:13.420050 543705 net.go:648] Add success.
I0319 16:32:13.422921 543705 net.go:770] primary dev: ETH0
I0319 16:32:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:32:13.422946 543705 net.go:698] Add success.
W0319 16:32:14.455141 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:32:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 16:32:14.455212 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:32:14.457152 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:32:14.457164 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:32:14.457169 543705 disk_worker.go:494] system disk:vda1
I0319 16:32:14.457170 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:32:14.457205 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:32:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:32:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:32:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:32:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:32:16.458011 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:32:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:32:16.472446 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:32:19.993671 543705 disk_info.go:125] begin check local disk info of client
I0319 16:32:19.996105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:32:19.996110 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5d40 0xc0000c5d80]
E0319 16:32:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:23.409781 543705 memory.go:184] no items to output this cycle
I0319 16:32:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:32:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:33.409804 543705 memory.go:184] no items to output this cycle
I0319 16:32:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 16:32:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:43.409826 543705 memory.go:191] Add success.
I0319 16:32:43.409830 543705 cpu.go:282] Add success.
I0319 16:32:43.419714 543705 net.go:648] Add success.
I0319 16:32:43.422522 543705 net.go:770] primary dev: ETH0
I0319 16:32:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:32:43.422547 543705 net.go:698] Add success.
I0319 16:32:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:32:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:32:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:32:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:32:53.409803 543705 memory.go:184] no items to output this cycle
I0319 16:32:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 16:33:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:03.409791 543705 cpu.go:275] no items to output this cycle
I0319 16:33:03.409812 543705 memory.go:184] no items to output this cycle
E0319 16:33:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:13.409830 543705 memory.go:191] Add success.
I0319 16:33:13.409832 543705 cpu.go:282] Add success.
W0319 16:33:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:33:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:33:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:33:13.420312 543705 net.go:648] Add success.
I0319 16:33:13.423138 543705 net.go:770] primary dev: ETH0
I0319 16:33:13.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:33:13.423164 543705 net.go:698] Add success.
I0319 16:33:13.469314 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"774153b2-85d4-4a9a-96a7-6f16d40933e2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:33:13.469348 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:33:14.454992 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:33:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:33:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0319 16:33:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:33:14.456644 543705 disk_worker.go:494] system disk:vda1
I0319 16:33:14.456677 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:33:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:33:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:33:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:33:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:33:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:33:19.997672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:33:20.000100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:33:20.000107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b1240 0xc0003b1280]
E0319 16:33:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:23.409790 543705 memory.go:184] no items to output this cycle
I0319 16:33:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:33:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:33.409780 543705 memory.go:184] no items to output this cycle
I0319 16:33:33.409785 543705 cpu.go:275] no items to output this cycle
I0319 16:33:37.769733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:33:37.769740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:33:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:43.410686 543705 memory.go:191] Add success.
I0319 16:33:43.409813 543705 cpu.go:282] Add success.
I0319 16:33:43.419745 543705 net.go:648] Add success.
I0319 16:33:43.422572 543705 net.go:770] primary dev: ETH0
I0319 16:33:43.422585 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:33:43.422597 543705 net.go:698] Add success.
I0319 16:33:46.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:33:46.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:33:46.458097 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:33:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:33:53.409803 543705 memory.go:184] no items to output this cycle
I0319 16:33:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 16:34:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:03.409804 543705 memory.go:184] no items to output this cycle
I0319 16:34:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 16:34:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:13.409785 543705 memory.go:191] Add success.
I0319 16:34:13.409807 543705 cpu.go:282] Add success.
W0319 16:34:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:34:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:34:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:34:13.420139 543705 net.go:648] Add success.
I0319 16:34:13.422957 543705 net.go:770] primary dev: ETH0
I0319 16:34:13.422972 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:34:13.422987 543705 net.go:698] Add success.
I0319 16:34:14.453930 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:34:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:34:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0319 16:34:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:34:14.458066 543705 disk_worker.go:494] system disk:vda1
I0319 16:34:14.458100 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:34:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:34:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:34:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:34:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:34:16.472534 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:34:20.001672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:34:20.004195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:34:20.004201 543705 disk_info.go:196] parse disk info done, disk is : [0xc000250d80 0xc000250dc0]
E0319 16:34:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:23.409791 543705 memory.go:184] no items to output this cycle
I0319 16:34:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:34:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:33.409770 543705 memory.go:184] no items to output this cycle
I0319 16:34:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 16:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:43.409793 543705 memory.go:191] Add success.
I0319 16:34:43.409812 543705 cpu.go:282] Add success.
I0319 16:34:43.419839 543705 net.go:770] primary dev: ETH0
I0319 16:34:43.419852 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:34:43.419864 543705 net.go:698] Add success.
I0319 16:34:43.420347 543705 net.go:648] Add success.
I0319 16:34:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:34:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:34:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:34:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:34:53.409786 543705 memory.go:184] no items to output this cycle
I0319 16:34:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 16:35:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:03.409784 543705 cpu.go:275] no items to output this cycle
I0319 16:35:03.409793 543705 memory.go:184] no items to output this cycle
E0319 16:35:13.410701 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:13.410727 543705 memory.go:191] Add success.
I0319 16:35:13.410728 543705 cpu.go:282] Add success.
W0319 16:35:13.410757 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:35:13.410769 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:35:13.410772 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:35:13.420162 543705 net.go:648] Add success.
I0319 16:35:13.423295 543705 net.go:770] primary dev: ETH0
I0319 16:35:13.423308 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:35:13.423320 543705 net.go:698] Add success.
I0319 16:35:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:35:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:35:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 16:35:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:35:14.457424 543705 disk_worker.go:494] system disk:vda1
I0319 16:35:14.457456 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:35:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:35:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:35:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:35:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:35:16.472431 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:35:20.005675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:35:20.008213 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:35:20.008219 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0319 16:35:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:23.409762 543705 memory.go:184] no items to output this cycle
I0319 16:35:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:35:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:33.409768 543705 memory.go:184] no items to output this cycle
I0319 16:35:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:35:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:43.409805 543705 memory.go:191] Add success.
I0319 16:35:43.409816 543705 cpu.go:282] Add success.
I0319 16:35:43.420045 543705 net.go:648] Add success.
I0319 16:35:43.422778 543705 net.go:770] primary dev: ETH0
I0319 16:35:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:35:43.422803 543705 net.go:698] Add success.
I0319 16:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:35:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:35:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:35:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:35:53.409786 543705 memory.go:184] no items to output this cycle
I0319 16:35:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 16:36:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:03.409784 543705 memory.go:184] no items to output this cycle
I0319 16:36:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:36:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:13.409793 543705 cpu.go:282] Add success.
I0319 16:36:13.409804 543705 memory.go:191] Add success.
W0319 16:36:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:36:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:36:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:36:13.420038 543705 net.go:648] Add success.
I0319 16:36:13.422957 543705 net.go:770] primary dev: ETH0
I0319 16:36:13.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:36:13.422984 543705 net.go:698] Add success.
I0319 16:36:13.999999 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5cfb3401-4158-4656-8880-b55050adca59","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:36:14.000033 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:36:14.454439 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:36:14.454664 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:36:14.454676 543705 disk_worker.go:708] disk space is not compliant
W0319 16:36:14.454679 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:36:14.456137 543705 disk_worker.go:494] system disk:vda1
I0319 16:36:14.456186 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:36:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:36:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:36:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:36:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:36:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:36:20.009676 543705 disk_info.go:125] begin check local disk info of client
I0319 16:36:20.012327 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:36:20.012333 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab000 0xc0001ab040]
E0319 16:36:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:23.409771 543705 memory.go:184] no items to output this cycle
I0319 16:36:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 16:36:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 16:36:33.409787 543705 memory.go:184] no items to output this cycle
I0319 16:36:37.771554 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:36:37.771561 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:36:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:43.410684 543705 memory.go:191] Add success.
I0319 16:36:43.409830 543705 cpu.go:282] Add success.
I0319 16:36:43.419740 543705 net.go:648] Add success.
I0319 16:36:43.422581 543705 net.go:770] primary dev: ETH0
I0319 16:36:43.422594 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:36:43.422607 543705 net.go:698] Add success.
I0319 16:36:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:36:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:36:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:36:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:36:53.409791 543705 memory.go:184] no items to output this cycle
I0319 16:36:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:37:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:03.409789 543705 memory.go:184] no items to output this cycle
I0319 16:37:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 16:37:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:13.409787 543705 memory.go:191] Add success.
I0319 16:37:13.409789 543705 cpu.go:282] Add success.
W0319 16:37:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:37:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:37:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:37:13.420079 543705 net.go:648] Add success.
I0319 16:37:13.422864 543705 net.go:770] primary dev: ETH0
I0319 16:37:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:37:13.422892 543705 net.go:698] Add success.
I0319 16:37:13.453460 543705 event_worker.go:152] Polling the log file for events...
W0319 16:37:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:37:14.455289 543705 disk_worker.go:708] disk space is not compliant
W0319 16:37:14.455294 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:37:14.456237 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:37:14.456248 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:37:14.456255 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:37:14.457320 543705 disk_worker.go:494] system disk:vda1
I0319 16:37:14.457370 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:37:15.457006 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:37:15.457020 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:37:16.457923 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:37:16.457923 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:37:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:37:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:37:16.472358 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:37:20.013676 543705 disk_info.go:125] begin check local disk info of client
I0319 16:37:20.016180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:37:20.016187 543705 disk_info.go:196] parse disk info done, disk is : [0xc000531c80 0xc000531cc0]
E0319 16:37:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:23.409763 543705 memory.go:184] no items to output this cycle
I0319 16:37:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 16:37:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:33.409793 543705 memory.go:184] no items to output this cycle
I0319 16:37:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:37:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:43.409877 543705 memory.go:191] Add success.
I0319 16:37:43.409909 543705 cpu.go:282] Add success.
I0319 16:37:43.419709 543705 net.go:648] Add success.
I0319 16:37:43.423086 543705 net.go:770] primary dev: ETH0
I0319 16:37:43.423098 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:37:43.423110 543705 net.go:698] Add success.
I0319 16:37:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:37:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:37:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:37:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:37:53.409774 543705 memory.go:184] no items to output this cycle
I0319 16:37:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 16:38:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:03.409782 543705 memory.go:184] no items to output this cycle
I0319 16:38:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:38:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:13.409797 543705 memory.go:191] Add success.
I0319 16:38:13.409796 543705 cpu.go:282] Add success.
W0319 16:38:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:38:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:38:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:38:13.420182 543705 net.go:648] Add success.
I0319 16:38:13.423140 543705 net.go:770] primary dev: ETH0
I0319 16:38:13.423154 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:38:13.423166 543705 net.go:698] Add success.
I0319 16:38:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:38:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:38:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 16:38:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:38:14.456693 543705 disk_worker.go:494] system disk:vda1
I0319 16:38:14.456727 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:38:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:38:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:38:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:38:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:38:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:38:20.017673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:38:20.020309 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:38:20.020316 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c000 0xc00035c040]
E0319 16:38:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:23.409776 543705 memory.go:184] no items to output this cycle
I0319 16:38:23.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:38:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:33.409794 543705 memory.go:184] no items to output this cycle
I0319 16:38:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:38:43.409887 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:43.409922 543705 memory.go:191] Add success.
I0319 16:38:43.410001 543705 cpu.go:282] Add success.
I0319 16:38:43.419719 543705 net.go:648] Add success.
I0319 16:38:43.422871 543705 net.go:770] primary dev: ETH0
I0319 16:38:43.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:38:43.422896 543705 net.go:698] Add success.
I0319 16:38:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:38:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:38:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:38:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:38:53.409777 543705 memory.go:184] no items to output this cycle
I0319 16:38:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:39:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:03.409781 543705 memory.go:184] no items to output this cycle
I0319 16:39:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 16:39:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:13.409786 543705 memory.go:191] Add success.
I0319 16:39:13.409786 543705 cpu.go:282] Add success.
W0319 16:39:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:39:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:39:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:39:13.420179 543705 net.go:648] Add success.
I0319 16:39:13.422999 543705 net.go:770] primary dev: ETH0
I0319 16:39:13.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:39:13.423027 543705 net.go:698] Add success.
I0319 16:39:13.498779 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7ab4cc4b-60a3-414f-be3e-73011f0253be","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:39:13.498812 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:39:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:39:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:39:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 16:39:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:39:14.456832 543705 disk_worker.go:494] system disk:vda1
I0319 16:39:14.456878 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:39:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:39:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:39:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:39:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:39:16.472483 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:39:20.021676 543705 disk_info.go:125] begin check local disk info of client
I0319 16:39:20.024216 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:39:20.024222 543705 disk_info.go:196] parse disk info done, disk is : [0xc000513b40 0xc000513b80]
E0319 16:39:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:23.409798 543705 memory.go:184] no items to output this cycle
I0319 16:39:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 16:39:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:33.409793 543705 memory.go:184] no items to output this cycle
I0319 16:39:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 16:39:37.772575 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:39:37.772581 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:39:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:43.410717 543705 memory.go:191] Add success.
I0319 16:39:43.409828 543705 cpu.go:282] Add success.
I0319 16:39:43.420411 543705 net.go:648] Add success.
I0319 16:39:43.423446 543705 net.go:770] primary dev: ETH0
I0319 16:39:43.423459 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:39:43.423471 543705 net.go:698] Add success.
I0319 16:39:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:39:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:39:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:39:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:39:53.409771 543705 memory.go:184] no items to output this cycle
I0319 16:39:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 16:40:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:03.409802 543705 memory.go:184] no items to output this cycle
I0319 16:40:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 16:40:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:13.409808 543705 memory.go:191] Add success.
I0319 16:40:13.409820 543705 cpu.go:282] Add success.
W0319 16:40:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:40:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:40:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:40:13.420117 543705 net.go:648] Add success.
I0319 16:40:13.422885 543705 net.go:770] primary dev: ETH0
I0319 16:40:13.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:40:13.422911 543705 net.go:698] Add success.
W0319 16:40:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:40:14.455267 543705 disk_worker.go:708] disk space is not compliant
W0319 16:40:14.455272 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:40:14.455635 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:40:14.457447 543705 disk_worker.go:494] system disk:vda1
I0319 16:40:14.457495 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:40:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:40:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:40:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:40:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:40:16.472483 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:40:20.025672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:40:20.028195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:40:20.028201 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a800 0xc00036a840]
E0319 16:40:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:23.409789 543705 memory.go:184] no items to output this cycle
I0319 16:40:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 16:40:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:33.409799 543705 memory.go:184] no items to output this cycle
I0319 16:40:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 16:40:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:43.409830 543705 memory.go:191] Add success.
I0319 16:40:43.409837 543705 cpu.go:282] Add success.
I0319 16:40:43.420209 543705 net.go:770] primary dev: ETH0
I0319 16:40:43.420223 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:40:43.420236 543705 net.go:698] Add success.
I0319 16:40:43.420470 543705 net.go:648] Add success.
I0319 16:40:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:40:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:40:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:40:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:40:53.409772 543705 memory.go:184] no items to output this cycle
I0319 16:40:53.409774 543705 cpu.go:275] no items to output this cycle
E0319 16:41:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:03.409796 543705 memory.go:184] no items to output this cycle
I0319 16:41:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:41:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:13.409783 543705 memory.go:191] Add success.
W0319 16:41:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:41:13.409814 543705 cpu.go:282] Add success.
W0319 16:41:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:41:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:41:13.420155 543705 net.go:648] Add success.
I0319 16:41:13.422998 543705 net.go:770] primary dev: ETH0
I0319 16:41:13.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:41:13.423030 543705 net.go:698] Add success.
I0319 16:41:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:41:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:41:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 16:41:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:41:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 16:41:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:41:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:41:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:41:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:41:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:41:16.472475 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:41:20.029673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:41:20.032268 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:41:20.032275 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0319 16:41:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:23.409792 543705 memory.go:184] no items to output this cycle
I0319 16:41:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 16:41:33.410705 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:33.410721 543705 memory.go:184] no items to output this cycle
I0319 16:41:33.410737 543705 cpu.go:275] no items to output this cycle
E0319 16:41:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:43.409803 543705 memory.go:191] Add success.
I0319 16:41:43.409805 543705 cpu.go:282] Add success.
I0319 16:41:43.419886 543705 net.go:648] Add success.
I0319 16:41:43.422602 543705 net.go:770] primary dev: ETH0
I0319 16:41:43.422615 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:41:43.422627 543705 net.go:698] Add success.
I0319 16:41:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:41:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:41:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:41:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:41:53.409787 543705 memory.go:184] no items to output this cycle
I0319 16:41:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:42:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:03.409786 543705 memory.go:184] no items to output this cycle
I0319 16:42:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 16:42:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:13.409797 543705 memory.go:191] Add success.
I0319 16:42:13.409800 543705 cpu.go:282] Add success.
W0319 16:42:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:42:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:42:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:42:13.420205 543705 net.go:648] Add success.
I0319 16:42:13.422888 543705 net.go:770] primary dev: ETH0
I0319 16:42:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:42:13.422913 543705 net.go:698] Add success.
I0319 16:42:13.836123 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"37c8d68b-40c8-422b-8102-4927851b3971","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:42:13.836156 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 16:42:14.454514 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:42:14.454525 543705 disk_worker.go:708] disk space is not compliant
W0319 16:42:14.454527 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:42:14.455422 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:42:14.455431 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:42:14.455446 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:42:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 16:42:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:42:15.456996 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:42:15.457011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:42:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:42:16.457991 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:42:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:42:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:42:16.472422 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:42:20.033677 543705 disk_info.go:125] begin check local disk info of client
I0319 16:42:20.036116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:42:20.036122 543705 disk_info.go:196] parse disk info done, disk is : [0xc000349000 0xc000349040]
E0319 16:42:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:23.409774 543705 memory.go:184] no items to output this cycle
I0319 16:42:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:42:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:33.409784 543705 memory.go:184] no items to output this cycle
I0319 16:42:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 16:42:37.773555 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:42:37.773561 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:42:43.409922 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:43.411022 543705 memory.go:191] Add success.
I0319 16:42:43.409973 543705 cpu.go:282] Add success.
I0319 16:42:43.419724 543705 net.go:648] Add success.
I0319 16:42:43.422704 543705 net.go:770] primary dev: ETH0
I0319 16:42:43.422717 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:42:43.422728 543705 net.go:698] Add success.
I0319 16:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:42:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:42:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:42:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:42:53.409793 543705 memory.go:184] no items to output this cycle
I0319 16:42:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:43:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:03.409816 543705 memory.go:184] no items to output this cycle
I0319 16:43:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 16:43:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:13.409811 543705 memory.go:191] Add success.
I0319 16:43:13.409813 543705 cpu.go:282] Add success.
W0319 16:43:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:43:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:43:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:43:13.420214 543705 net.go:648] Add success.
I0319 16:43:13.422904 543705 net.go:770] primary dev: ETH0
I0319 16:43:13.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:43:13.422929 543705 net.go:698] Add success.
I0319 16:43:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:43:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:43:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 16:43:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:43:14.456508 543705 disk_worker.go:494] system disk:vda1
I0319 16:43:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:43:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:43:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:43:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:43:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:43:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:43:20.037674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:43:20.040299 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:43:20.040306 543705 disk_info.go:196] parse disk info done, disk is : [0xc000288780 0xc0002887c0]
E0319 16:43:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:23.409776 543705 memory.go:184] no items to output this cycle
I0319 16:43:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 16:43:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:33.409783 543705 memory.go:184] no items to output this cycle
I0319 16:43:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 16:43:43.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:43.409899 543705 memory.go:191] Add success.
I0319 16:43:43.409916 543705 cpu.go:282] Add success.
I0319 16:43:43.419710 543705 net.go:648] Add success.
I0319 16:43:43.422587 543705 net.go:770] primary dev: ETH0
I0319 16:43:43.422600 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:43:43.422612 543705 net.go:698] Add success.
I0319 16:43:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:43:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:43:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:43:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:43:53.409784 543705 memory.go:184] no items to output this cycle
I0319 16:43:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 16:44:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:03.409802 543705 memory.go:184] no items to output this cycle
I0319 16:44:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 16:44:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:13.409781 543705 memory.go:191] Add success.
W0319 16:44:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:44:13.409810 543705 cpu.go:282] Add success.
W0319 16:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:44:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:44:13.420160 543705 net.go:648] Add success.
I0319 16:44:13.423032 543705 net.go:770] primary dev: ETH0
I0319 16:44:13.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:44:13.423057 543705 net.go:698] Add success.
I0319 16:44:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:44:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:44:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 16:44:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:44:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 16:44:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:44:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:44:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:44:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:44:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:44:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:44:20.041676 543705 disk_info.go:125] begin check local disk info of client
I0319 16:44:20.044233 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:44:20.044240 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039bf40 0xc00033c000]
E0319 16:44:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:23.409808 543705 memory.go:184] no items to output this cycle
I0319 16:44:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 16:44:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:33.409798 543705 memory.go:184] no items to output this cycle
I0319 16:44:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:44:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:43.409796 543705 memory.go:191] Add success.
I0319 16:44:43.409825 543705 cpu.go:282] Add success.
I0319 16:44:43.419715 543705 net.go:648] Add success.
I0319 16:44:43.423139 543705 net.go:770] primary dev: ETH0
I0319 16:44:43.423152 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:44:43.423164 543705 net.go:698] Add success.
I0319 16:44:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:44:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:44:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:44:53.410201 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:44:53.410217 543705 memory.go:184] no items to output this cycle
I0319 16:44:53.410223 543705 cpu.go:275] no items to output this cycle
E0319 16:45:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:03.409790 543705 memory.go:184] no items to output this cycle
I0319 16:45:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 16:45:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:13.409783 543705 memory.go:191] Add success.
W0319 16:45:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:45:13.409814 543705 cpu.go:282] Add success.
W0319 16:45:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:45:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:45:13.420072 543705 net.go:648] Add success.
I0319 16:45:13.423050 543705 net.go:770] primary dev: ETH0
I0319 16:45:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:45:13.423076 543705 net.go:698] Add success.
I0319 16:45:13.469320 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d1f81da8-04f4-42ec-894f-471fa1251a2e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:45:13.469354 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:45:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:45:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:45:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0319 16:45:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:45:14.456610 543705 disk_worker.go:494] system disk:vda1
I0319 16:45:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:45:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:45:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:45:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:45:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:45:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:45:20.045675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:45:20.048230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:45:20.048236 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b4800 0xc0002b4840]
E0319 16:45:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:23.409796 543705 memory.go:184] no items to output this cycle
I0319 16:45:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 16:45:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:33.409782 543705 memory.go:184] no items to output this cycle
I0319 16:45:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 16:45:37.773728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:45:37.773735 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:45:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:43.410583 543705 memory.go:191] Add success.
I0319 16:45:43.409812 543705 cpu.go:282] Add success.
I0319 16:45:43.419711 543705 net.go:648] Add success.
I0319 16:45:43.422374 543705 net.go:770] primary dev: ETH0
I0319 16:45:43.422387 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:45:43.422398 543705 net.go:698] Add success.
I0319 16:45:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:45:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:45:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:45:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:45:53.409811 543705 memory.go:184] no items to output this cycle
I0319 16:45:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 16:46:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:03.409770 543705 memory.go:184] no items to output this cycle
I0319 16:46:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:46:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:13.409807 543705 memory.go:191] Add success.
I0319 16:46:13.409816 543705 cpu.go:282] Add success.
W0319 16:46:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:46:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:46:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:46:13.420119 543705 net.go:648] Add success.
I0319 16:46:13.423095 543705 net.go:770] primary dev: ETH0
I0319 16:46:13.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:46:13.423134 543705 net.go:698] Add success.
I0319 16:46:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:46:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:46:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 16:46:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:46:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 16:46:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:46:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:46:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:46:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:46:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:46:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:46:20.049670 543705 disk_info.go:125] begin check local disk info of client
I0319 16:46:20.052229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:46:20.052235 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330e00 0xc000330e40]
E0319 16:46:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:23.409770 543705 memory.go:184] no items to output this cycle
I0319 16:46:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 16:46:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:33.409797 543705 memory.go:184] no items to output this cycle
I0319 16:46:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 16:46:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:43.409825 543705 memory.go:191] Add success.
I0319 16:46:43.409829 543705 cpu.go:282] Add success.
I0319 16:46:43.419807 543705 net.go:770] primary dev: ETH0
I0319 16:46:43.419821 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:46:43.419834 543705 net.go:698] Add success.
I0319 16:46:43.420490 543705 net.go:648] Add success.
I0319 16:46:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:46:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:46:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:46:53.410244 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:46:53.410263 543705 memory.go:184] no items to output this cycle
I0319 16:46:53.410275 543705 cpu.go:275] no items to output this cycle
E0319 16:47:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:03.409780 543705 memory.go:184] no items to output this cycle
I0319 16:47:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 16:47:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:13.409790 543705 memory.go:191] Add success.
I0319 16:47:13.409790 543705 cpu.go:282] Add success.
W0319 16:47:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:47:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:47:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:47:13.420117 543705 net.go:648] Add success.
I0319 16:47:13.423199 543705 net.go:770] primary dev: ETH0
I0319 16:47:13.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:47:13.423226 543705 net.go:698] Add success.
I0319 16:47:13.452854 543705 event_worker.go:152] Polling the log file for events...
W0319 16:47:14.455246 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:47:14.455262 543705 disk_worker.go:708] disk space is not compliant
W0319 16:47:14.455266 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:47:14.455902 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:47:14.455911 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:47:14.455917 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:47:14.456825 543705 disk_worker.go:494] system disk:vda1
I0319 16:47:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:47:15.457026 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:47:15.457041 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:47:16.458038 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:47:16.458049 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:47:16.458092 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:47:16.458110 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:47:16.472485 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:47:20.053672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:47:20.056121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:47:20.056128 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7d80 0xc0003b7dc0]
E0319 16:47:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:23.409771 543705 memory.go:184] no items to output this cycle
I0319 16:47:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:47:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:33.409780 543705 memory.go:184] no items to output this cycle
I0319 16:47:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:47:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:43.409813 543705 memory.go:191] Add success.
I0319 16:47:43.409822 543705 cpu.go:282] Add success.
I0319 16:47:43.419872 543705 net.go:648] Add success.
I0319 16:47:43.422546 543705 net.go:770] primary dev: ETH0
I0319 16:47:43.422561 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:47:43.422575 543705 net.go:698] Add success.
I0319 16:47:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:47:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:47:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:47:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:47:53.409790 543705 memory.go:184] no items to output this cycle
I0319 16:47:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 16:48:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:03.409783 543705 memory.go:184] no items to output this cycle
I0319 16:48:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 16:48:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:13.409788 543705 memory.go:191] Add success.
W0319 16:48:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:48:13.409821 543705 cpu.go:282] Add success.
W0319 16:48:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:48:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:48:13.420238 543705 net.go:648] Add success.
I0319 16:48:13.423050 543705 net.go:770] primary dev: ETH0
I0319 16:48:13.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:48:13.423081 543705 net.go:698] Add success.
I0319 16:48:13.469134 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"65639d3f-5f28-403f-b6a5-907e1c61fbde","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:48:13.469166 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:48:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:48:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:48:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 16:48:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:48:14.456585 543705 disk_worker.go:494] system disk:vda1
I0319 16:48:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:48:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:48:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:48:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:48:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:48:16.472445 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:48:20.057672 543705 disk_info.go:125] begin check local disk info of client
I0319 16:48:20.060204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:48:20.060211 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7380 0xc0003b73c0]
E0319 16:48:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:23.409767 543705 memory.go:184] no items to output this cycle
I0319 16:48:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 16:48:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:33.409798 543705 memory.go:184] no items to output this cycle
I0319 16:48:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 16:48:37.775574 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:48:37.775582 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:48:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:43.410673 543705 memory.go:191] Add success.
I0319 16:48:43.409811 543705 cpu.go:282] Add success.
I0319 16:48:43.420446 543705 net.go:648] Add success.
I0319 16:48:43.423253 543705 net.go:770] primary dev: ETH0
I0319 16:48:43.423267 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:48:43.423281 543705 net.go:698] Add success.
I0319 16:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:48:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:48:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:48:53.409876 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:48:53.409902 543705 memory.go:184] no items to output this cycle
I0319 16:48:53.409924 543705 cpu.go:275] no items to output this cycle
E0319 16:49:03.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:03.409760 543705 memory.go:184] no items to output this cycle
I0319 16:49:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 16:49:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:13.409802 543705 memory.go:191] Add success.
I0319 16:49:13.409808 543705 cpu.go:282] Add success.
W0319 16:49:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:49:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:49:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:49:13.420052 543705 net.go:648] Add success.
I0319 16:49:13.422996 543705 net.go:770] primary dev: ETH0
I0319 16:49:13.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:49:13.423038 543705 net.go:698] Add success.
I0319 16:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:49:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:49:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 16:49:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:49:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 16:49:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:49:15.455985 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:49:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:49:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:49:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:49:16.472480 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:49:20.061675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:49:20.064268 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:49:20.064275 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047da80 0xc00047dac0]
E0319 16:49:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:23.409773 543705 memory.go:184] no items to output this cycle
I0319 16:49:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:49:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:33.409780 543705 memory.go:184] no items to output this cycle
I0319 16:49:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 16:49:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:43.409779 543705 memory.go:191] Add success.
I0319 16:49:43.409810 543705 cpu.go:282] Add success.
I0319 16:49:43.419976 543705 net.go:648] Add success.
I0319 16:49:43.422981 543705 net.go:770] primary dev: ETH0
I0319 16:49:43.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:49:43.423010 543705 net.go:698] Add success.
I0319 16:49:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:49:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:49:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:49:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:49:53.409773 543705 memory.go:184] no items to output this cycle
I0319 16:49:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 16:50:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:03.409900 543705 memory.go:184] no items to output this cycle
I0319 16:50:03.409933 543705 cpu.go:275] no items to output this cycle
E0319 16:50:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:13.409777 543705 memory.go:191] Add success.
W0319 16:50:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 16:50:13.409810 543705 cpu.go:282] Add success.
W0319 16:50:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:50:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:50:13.420266 543705 net.go:648] Add success.
I0319 16:50:13.423319 543705 net.go:770] primary dev: ETH0
I0319 16:50:13.423334 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:50:13.423347 543705 net.go:698] Add success.
I0319 16:50:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:50:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:50:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 16:50:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:50:14.456486 543705 disk_worker.go:494] system disk:vda1
I0319 16:50:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:50:15.455992 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:50:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:50:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:50:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:50:16.472476 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:50:20.065678 543705 disk_info.go:125] begin check local disk info of client
I0319 16:50:20.068214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:50:20.068230 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003566c0 0xc000356700]
E0319 16:50:23.410267 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:23.410285 543705 memory.go:184] no items to output this cycle
I0319 16:50:23.410296 543705 cpu.go:275] no items to output this cycle
E0319 16:50:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:33.409769 543705 memory.go:184] no items to output this cycle
I0319 16:50:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 16:50:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:43.409796 543705 memory.go:191] Add success.
I0319 16:50:43.409798 543705 cpu.go:282] Add success.
I0319 16:50:43.419868 543705 net.go:648] Add success.
I0319 16:50:43.422576 543705 net.go:770] primary dev: ETH0
I0319 16:50:43.422589 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:50:43.422604 543705 net.go:698] Add success.
I0319 16:50:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:50:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:50:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:50:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:50:53.409783 543705 memory.go:184] no items to output this cycle
I0319 16:50:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 16:51:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:03.409767 543705 memory.go:184] no items to output this cycle
I0319 16:51:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:51:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:13.409821 543705 memory.go:191] Add success.
I0319 16:51:13.409831 543705 cpu.go:282] Add success.
W0319 16:51:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:51:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:51:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:51:13.420173 543705 net.go:648] Add success.
I0319 16:51:13.423426 543705 net.go:770] primary dev: ETH0
I0319 16:51:13.423438 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:51:13.423450 543705 net.go:698] Add success.
I0319 16:51:13.464191 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0159ad41-909f-4507-9b96-29e5245e9a7b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:51:13.464225 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:51:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:51:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:51:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 16:51:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:51:14.456790 543705 disk_worker.go:494] system disk:vda1
I0319 16:51:14.456819 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:51:15.455991 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:51:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:51:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:51:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:51:16.472463 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:51:20.069674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:51:20.072281 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:51:20.072287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4b40 0xc0000c4b80]
E0319 16:51:23.410421 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:23.410436 543705 memory.go:184] no items to output this cycle
I0319 16:51:23.410441 543705 cpu.go:275] no items to output this cycle
E0319 16:51:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:33.409762 543705 memory.go:184] no items to output this cycle
I0319 16:51:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 16:51:37.776570 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:51:37.776577 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:51:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:43.410544 543705 memory.go:191] Add success.
I0319 16:51:43.409805 543705 cpu.go:282] Add success.
I0319 16:51:43.420237 543705 net.go:648] Add success.
I0319 16:51:43.422910 543705 net.go:770] primary dev: ETH0
I0319 16:51:43.422925 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:51:43.422939 543705 net.go:698] Add success.
I0319 16:51:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:51:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:51:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:51:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:51:53.409769 543705 memory.go:184] no items to output this cycle
I0319 16:51:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 16:52:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:03.409782 543705 cpu.go:275] no items to output this cycle
I0319 16:52:03.409787 543705 memory.go:184] no items to output this cycle
E0319 16:52:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:13.409798 543705 memory.go:191] Add success.
I0319 16:52:13.409800 543705 cpu.go:282] Add success.
W0319 16:52:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:52:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:52:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:52:13.420128 543705 net.go:648] Add success.
I0319 16:52:13.423312 543705 net.go:770] primary dev: ETH0
I0319 16:52:13.423325 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:52:13.423337 543705 net.go:698] Add success.
W0319 16:52:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:52:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 16:52:14.455162 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:52:14.456914 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:52:14.456924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:52:14.456930 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:52:14.456989 543705 disk_worker.go:494] system disk:vda1
I0319 16:52:14.457019 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:52:15.456970 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:52:15.456989 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:52:16.458099 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:52:16.458150 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:52:16.458174 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:52:16.458194 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:52:16.472561 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:52:20.073674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:52:20.076123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:52:20.076128 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4440 0xc0000c4480]
E0319 16:52:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:23.409793 543705 memory.go:184] no items to output this cycle
I0319 16:52:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 16:52:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:33.409806 543705 memory.go:184] no items to output this cycle
I0319 16:52:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 16:52:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:43.409832 543705 memory.go:191] Add success.
I0319 16:52:43.409835 543705 cpu.go:282] Add success.
I0319 16:52:43.420120 543705 net.go:648] Add success.
I0319 16:52:43.422850 543705 net.go:770] primary dev: ETH0
I0319 16:52:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:52:43.422876 543705 net.go:698] Add success.
I0319 16:52:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:52:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:52:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:52:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:52:53.409766 543705 memory.go:184] no items to output this cycle
I0319 16:52:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 16:53:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:03.409799 543705 memory.go:184] no items to output this cycle
I0319 16:53:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:53:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:13.409830 543705 memory.go:191] Add success.
I0319 16:53:13.409840 543705 cpu.go:282] Add success.
W0319 16:53:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:53:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:53:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:53:13.420182 543705 net.go:648] Add success.
I0319 16:53:13.423041 543705 net.go:770] primary dev: ETH0
I0319 16:53:13.423055 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:53:13.423067 543705 net.go:698] Add success.
I0319 16:53:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:53:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:53:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 16:53:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:53:14.456603 543705 disk_worker.go:494] system disk:vda1
I0319 16:53:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:53:15.455987 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:53:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:53:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:53:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:53:16.472495 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:53:20.077674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:53:20.080200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:53:20.080207 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa3c0 0xc0001aa400]
E0319 16:53:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:23.409796 543705 memory.go:184] no items to output this cycle
I0319 16:53:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 16:53:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:33.409770 543705 memory.go:184] no items to output this cycle
I0319 16:53:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 16:53:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:43.409798 543705 memory.go:191] Add success.
I0319 16:53:43.409802 543705 cpu.go:282] Add success.
I0319 16:53:43.419977 543705 net.go:648] Add success.
I0319 16:53:43.422572 543705 net.go:770] primary dev: ETH0
I0319 16:53:43.422587 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:53:43.422601 543705 net.go:698] Add success.
I0319 16:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:53:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:53:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:53:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:53:53.409770 543705 memory.go:184] no items to output this cycle
I0319 16:53:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:54:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:03.409798 543705 memory.go:184] no items to output this cycle
I0319 16:54:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:54:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:13.409797 543705 memory.go:191] Add success.
I0319 16:54:13.409804 543705 cpu.go:282] Add success.
W0319 16:54:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:54:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:54:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:54:13.420450 543705 net.go:648] Add success.
I0319 16:54:13.423408 543705 net.go:770] primary dev: ETH0
I0319 16:54:13.423421 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:54:13.423433 543705 net.go:698] Add success.
I0319 16:54:13.468572 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4dea14ab-600f-4ecb-8d5c-8357892fb7da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:54:13.468604 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 16:54:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:54:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:54:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0319 16:54:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:54:14.456638 543705 disk_worker.go:494] system disk:vda1
I0319 16:54:14.456667 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:54:15.456007 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:54:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:54:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:54:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:54:16.472468 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:54:20.081673 543705 disk_info.go:125] begin check local disk info of client
I0319 16:54:20.084212 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:54:20.084219 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab700 0xc0001ab740]
E0319 16:54:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:23.409800 543705 memory.go:184] no items to output this cycle
I0319 16:54:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:54:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:33.409778 543705 memory.go:184] no items to output this cycle
I0319 16:54:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 16:54:37.777588 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:54:37.777602 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:54:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:43.410769 543705 memory.go:191] Add success.
I0319 16:54:43.409835 543705 cpu.go:282] Add success.
I0319 16:54:43.420528 543705 net.go:648] Add success.
I0319 16:54:43.423258 543705 net.go:770] primary dev: ETH0
I0319 16:54:43.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:54:43.423285 543705 net.go:698] Add success.
I0319 16:54:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:54:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:54:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:54:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:54:53.409786 543705 memory.go:184] no items to output this cycle
I0319 16:54:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 16:55:03.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:03.409902 543705 memory.go:184] no items to output this cycle
I0319 16:55:03.409919 543705 cpu.go:275] no items to output this cycle
E0319 16:55:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:13.409790 543705 memory.go:191] Add success.
I0319 16:55:13.409807 543705 cpu.go:282] Add success.
W0319 16:55:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:55:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:55:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:55:13.420249 543705 net.go:648] Add success.
I0319 16:55:13.423111 543705 net.go:770] primary dev: ETH0
I0319 16:55:13.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:55:13.423136 543705 net.go:698] Add success.
I0319 16:55:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:55:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:55:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 16:55:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:55:14.456569 543705 disk_worker.go:494] system disk:vda1
I0319 16:55:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:55:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:55:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:55:16.458073 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:55:16.458102 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:55:16.472483 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:55:20.085692 543705 disk_info.go:125] begin check local disk info of client
I0319 16:55:20.088249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:55:20.088254 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357b00 0xc000357b40]
E0319 16:55:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:23.409777 543705 memory.go:184] no items to output this cycle
I0319 16:55:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 16:55:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:33.409777 543705 memory.go:184] no items to output this cycle
I0319 16:55:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 16:55:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:43.409788 543705 memory.go:191] Add success.
I0319 16:55:43.409820 543705 cpu.go:282] Add success.
I0319 16:55:43.420030 543705 net.go:648] Add success.
I0319 16:55:43.422712 543705 net.go:770] primary dev: ETH0
I0319 16:55:43.422728 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:55:43.422743 543705 net.go:698] Add success.
I0319 16:55:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:55:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:55:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:55:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:55:53.409782 543705 memory.go:184] no items to output this cycle
I0319 16:55:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 16:56:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:03.409790 543705 memory.go:184] no items to output this cycle
I0319 16:56:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 16:56:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:13.409794 543705 memory.go:191] Add success.
I0319 16:56:13.409799 543705 cpu.go:282] Add success.
W0319 16:56:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:56:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:56:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:56:13.420073 543705 net.go:648] Add success.
I0319 16:56:13.422967 543705 net.go:770] primary dev: ETH0
I0319 16:56:13.422981 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:56:13.422993 543705 net.go:698] Add success.
I0319 16:56:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:56:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:56:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 16:56:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:56:14.456545 543705 disk_worker.go:494] system disk:vda1
I0319 16:56:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:56:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:56:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:56:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:56:16.458094 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:56:16.472493 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:56:20.089687 543705 disk_info.go:125] begin check local disk info of client
I0319 16:56:20.092323 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:56:20.092329 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5200 0xc0000c5240]
E0319 16:56:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:23.409769 543705 memory.go:184] no items to output this cycle
I0319 16:56:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 16:56:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:33.409778 543705 memory.go:184] no items to output this cycle
I0319 16:56:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 16:56:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:43.409802 543705 memory.go:191] Add success.
I0319 16:56:43.409820 543705 cpu.go:282] Add success.
I0319 16:56:43.420061 543705 net.go:648] Add success.
I0319 16:56:43.422741 543705 net.go:770] primary dev: ETH0
I0319 16:56:43.422754 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:56:43.422770 543705 net.go:698] Add success.
I0319 16:56:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:56:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:56:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:56:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:56:53.409780 543705 memory.go:184] no items to output this cycle
I0319 16:56:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 16:57:03.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:03.409904 543705 memory.go:184] no items to output this cycle
I0319 16:57:03.409910 543705 cpu.go:275] no items to output this cycle
E0319 16:57:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:13.409784 543705 memory.go:191] Add success.
I0319 16:57:13.409810 543705 cpu.go:282] Add success.
W0319 16:57:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:57:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:57:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:57:13.420110 543705 net.go:648] Add success.
I0319 16:57:13.422869 543705 net.go:770] primary dev: ETH0
I0319 16:57:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:57:13.422894 543705 net.go:698] Add success.
I0319 16:57:13.429485 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 16:57:13.453721 543705 event_worker.go:152] Polling the log file for events...
I0319 16:57:13.463973 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3c529449-0df6-45d9-bc71-11731a63d77a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 16:57:13.464007 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 16:57:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:57:14.455247 543705 disk_worker.go:708] disk space is not compliant
W0319 16:57:14.455252 543705 disk_worker.go:728] disk inode is not compliant
E0319 16:57:14.455864 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 16:57:14.455872 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 16:57:14.455877 543705 custom_config.go:64] query custom config with name: gpu
I0319 16:57:14.456777 543705 disk_worker.go:494] system disk:vda1
I0319 16:57:14.456806 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 16:57:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 16:57:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:57:16.458100 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 16:57:16.458147 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 16:57:16.458172 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:57:16.458192 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:57:16.472606 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:57:20.093675 543705 disk_info.go:125] begin check local disk info of client
I0319 16:57:20.096168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:57:20.096174 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252800 0xc000252840]
E0319 16:57:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:23.409765 543705 memory.go:184] no items to output this cycle
I0319 16:57:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 16:57:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:33.409801 543705 memory.go:184] no items to output this cycle
I0319 16:57:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 16:57:37.777734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 16:57:37.777741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 16:57:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:43.410705 543705 memory.go:191] Add success.
I0319 16:57:43.409800 543705 cpu.go:282] Add success.
I0319 16:57:43.420390 543705 net.go:648] Add success.
I0319 16:57:43.423130 543705 net.go:770] primary dev: ETH0
I0319 16:57:43.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:57:43.423158 543705 net.go:698] Add success.
I0319 16:57:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:57:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:57:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:57:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:57:53.409791 543705 memory.go:184] no items to output this cycle
I0319 16:57:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 16:58:03.409895 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:03.409905 543705 cpu.go:275] no items to output this cycle
I0319 16:58:03.409916 543705 memory.go:184] no items to output this cycle
E0319 16:58:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:13.409779 543705 memory.go:191] Add success.
W0319 16:58:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:58:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:58:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:58:13.409850 543705 cpu.go:282] Add success.
I0319 16:58:13.420509 543705 net.go:648] Add success.
I0319 16:58:13.423373 543705 net.go:770] primary dev: ETH0
I0319 16:58:13.423391 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:58:13.423410 543705 net.go:698] Add success.
I0319 16:58:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:58:14.455230 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:58:14.455244 543705 disk_worker.go:708] disk space is not compliant
W0319 16:58:14.455246 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:58:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 16:58:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:58:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:58:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:58:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:58:16.458093 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:58:16.472504 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:58:20.097674 543705 disk_info.go:125] begin check local disk info of client
I0319 16:58:20.100241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:58:20.100247 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007adc0 0xc00007ae00]
E0319 16:58:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:23.409800 543705 memory.go:184] no items to output this cycle
I0319 16:58:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:58:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:33.409772 543705 memory.go:184] no items to output this cycle
I0319 16:58:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 16:58:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:43.409829 543705 memory.go:191] Add success.
I0319 16:58:43.409832 543705 cpu.go:282] Add success.
I0319 16:58:43.420002 543705 net.go:648] Add success.
I0319 16:58:43.422706 543705 net.go:770] primary dev: ETH0
I0319 16:58:43.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:58:43.422734 543705 net.go:698] Add success.
I0319 16:58:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:58:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:58:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:58:53.409776 543705 cpu.go:275] no items to output this cycle
E0319 16:58:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:58:53.409790 543705 memory.go:184] no items to output this cycle
E0319 16:59:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:03.409805 543705 memory.go:184] no items to output this cycle
I0319 16:59:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 16:59:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:13.409791 543705 memory.go:191] Add success.
I0319 16:59:13.409807 543705 cpu.go:282] Add success.
W0319 16:59:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 16:59:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 16:59:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 16:59:13.420115 543705 net.go:648] Add success.
I0319 16:59:13.422788 543705 net.go:770] primary dev: ETH0
I0319 16:59:13.422801 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:59:13.422813 543705 net.go:698] Add success.
I0319 16:59:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 16:59:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 16:59:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 16:59:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 16:59:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 16:59:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 16:59:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 16:59:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:59:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:59:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0319 16:59:16.472475 543705 disk_local_worker.go:436] Get disk info: []
I0319 16:59:20.101678 543705 disk_info.go:125] begin check local disk info of client
I0319 16:59:20.104198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 16:59:20.104205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b69c0 0xc0003b6a00]
E0319 16:59:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:23.409798 543705 memory.go:184] no items to output this cycle
I0319 16:59:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 16:59:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:33.409770 543705 memory.go:184] no items to output this cycle
I0319 16:59:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 16:59:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:43.409794 543705 memory.go:191] Add success.
I0319 16:59:43.409808 543705 cpu.go:282] Add success.
I0319 16:59:43.419908 543705 net.go:648] Add success.
I0319 16:59:43.422725 543705 net.go:770] primary dev: ETH0
I0319 16:59:43.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 16:59:43.422749 543705 net.go:698] Add success.
I0319 16:59:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 16:59:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 16:59:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 16:59:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 16:59:53.409812 543705 memory.go:184] no items to output this cycle
I0319 16:59:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 17:00:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:03.409779 543705 memory.go:184] no items to output this cycle
I0319 17:00:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 17:00:13.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:13.409912 543705 memory.go:191] Add success.
W0319 17:00:13.409940 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:00:13.409956 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:00:13.409959 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:00:13.410089 543705 cpu.go:282] Add success.
I0319 17:00:13.419713 543705 net.go:648] Add success.
I0319 17:00:13.422412 543705 net.go:770] primary dev: ETH0
I0319 17:00:13.422427 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:00:13.422441 543705 net.go:698] Add success.
I0319 17:00:13.512878 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b05e35d1-402c-479f-bab4-ddae3e8c620a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:00:13.512909 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:00:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:00:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:00:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0319 17:00:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:00:14.456476 543705 disk_worker.go:494] system disk:vda1
I0319 17:00:14.456518 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:00:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:00:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:00:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:00:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:00:16.472488 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:00:20.105677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:00:20.108171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:00:20.108177 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ab6c0 0xc0003ab700]
E0319 17:00:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:23.409794 543705 memory.go:184] no items to output this cycle
I0319 17:00:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 17:00:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:33.409787 543705 memory.go:184] no items to output this cycle
I0319 17:00:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 17:00:37.777882 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:00:37.777889 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:00:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:43.410758 543705 memory.go:191] Add success.
I0319 17:00:43.409823 543705 cpu.go:282] Add success.
I0319 17:00:43.420644 543705 net.go:648] Add success.
I0319 17:00:43.423412 543705 net.go:770] primary dev: ETH0
I0319 17:00:43.423426 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:00:43.423438 543705 net.go:698] Add success.
I0319 17:00:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:00:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:00:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:00:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:00:53.409797 543705 memory.go:184] no items to output this cycle
I0319 17:00:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 17:01:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:03.409769 543705 memory.go:184] no items to output this cycle
I0319 17:01:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 17:01:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:13.409816 543705 memory.go:191] Add success.
I0319 17:01:13.409823 543705 cpu.go:282] Add success.
W0319 17:01:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:01:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:01:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:01:13.419952 543705 net.go:770] primary dev: ETH0
I0319 17:01:13.419965 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:01:13.419979 543705 net.go:698] Add success.
I0319 17:01:13.420451 543705 net.go:648] Add success.
I0319 17:01:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:01:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:01:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 17:01:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:01:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 17:01:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:01:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:01:16.458002 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:01:16.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:01:16.458103 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:01:16.472471 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:01:20.109674 543705 disk_info.go:125] begin check local disk info of client
I0319 17:01:20.112225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:01:20.112231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2440 0xc0003b2480]
E0319 17:01:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:23.409799 543705 memory.go:184] no items to output this cycle
I0319 17:01:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 17:01:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:33.409779 543705 memory.go:184] no items to output this cycle
I0319 17:01:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 17:01:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:43.409802 543705 memory.go:191] Add success.
I0319 17:01:43.409802 543705 cpu.go:282] Add success.
I0319 17:01:43.420071 543705 net.go:648] Add success.
I0319 17:01:43.422837 543705 net.go:770] primary dev: ETH0
I0319 17:01:43.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:01:43.422867 543705 net.go:698] Add success.
I0319 17:01:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:01:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:01:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:01:53.409804 543705 memory.go:184] no items to output this cycle
I0319 17:01:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:02:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:03.409768 543705 memory.go:184] no items to output this cycle
I0319 17:02:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 17:02:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:13.409793 543705 memory.go:191] Add success.
I0319 17:02:13.409793 543705 cpu.go:282] Add success.
W0319 17:02:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:02:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:02:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:02:13.420146 543705 net.go:648] Add success.
I0319 17:02:13.423177 543705 net.go:770] primary dev: ETH0
I0319 17:02:13.423190 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:02:13.423203 543705 net.go:698] Add success.
W0319 17:02:14.455282 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:02:14.455404 543705 disk_worker.go:708] disk space is not compliant
W0319 17:02:14.455407 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:02:14.458899 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:02:14.458907 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:02:14.458911 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:02:14.459104 543705 disk_worker.go:494] system disk:vda1
I0319 17:02:14.459131 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:02:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:02:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:02:16.458102 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:02:16.458140 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:02:16.458175 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:02:16.458195 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:02:16.472539 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:02:20.113674 543705 disk_info.go:125] begin check local disk info of client
I0319 17:02:20.116255 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:02:20.116261 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049c400 0xc00049c440]
E0319 17:02:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:23.409788 543705 memory.go:184] no items to output this cycle
I0319 17:02:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 17:02:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:33.409795 543705 memory.go:184] no items to output this cycle
I0319 17:02:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 17:02:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:43.409835 543705 memory.go:191] Add success.
I0319 17:02:43.409840 543705 cpu.go:282] Add success.
I0319 17:02:43.420396 543705 net.go:648] Add success.
I0319 17:02:43.423501 543705 net.go:770] primary dev: ETH0
I0319 17:02:43.423514 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:02:43.423536 543705 net.go:698] Add success.
I0319 17:02:46.458393 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:02:46.458471 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:02:46.458494 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:02:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:02:53.409799 543705 memory.go:184] no items to output this cycle
I0319 17:02:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 17:03:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:03.409800 543705 memory.go:184] no items to output this cycle
I0319 17:03:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 17:03:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:13.409805 543705 cpu.go:282] Add success.
I0319 17:03:13.409828 543705 memory.go:191] Add success.
W0319 17:03:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:03:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:03:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:03:13.420498 543705 net.go:648] Add success.
I0319 17:03:13.423407 543705 net.go:770] primary dev: ETH0
I0319 17:03:13.423426 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:03:13.423444 543705 net.go:698] Add success.
I0319 17:03:13.958621 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6662ce26-c59c-4519-b892-87d00d4a3992","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:03:13.958660 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:03:14.453965 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:03:14.455216 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:03:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0319 17:03:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:03:14.456545 543705 disk_worker.go:494] system disk:vda1
I0319 17:03:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:03:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:03:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:03:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:03:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:03:16.472440 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:03:20.117671 543705 disk_info.go:125] begin check local disk info of client
I0319 17:03:20.120234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:03:20.120241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005752c0 0xc000575300]
E0319 17:03:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:23.409766 543705 memory.go:184] no items to output this cycle
I0319 17:03:23.409833 543705 cpu.go:275] no items to output this cycle
E0319 17:03:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:33.409801 543705 memory.go:184] no items to output this cycle
I0319 17:03:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 17:03:37.778027 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:03:37.778034 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:03:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:43.410687 543705 memory.go:191] Add success.
I0319 17:03:43.409800 543705 cpu.go:282] Add success.
I0319 17:03:43.420398 543705 net.go:648] Add success.
I0319 17:03:43.423085 543705 net.go:770] primary dev: ETH0
I0319 17:03:43.423098 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:03:43.423112 543705 net.go:698] Add success.
I0319 17:03:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:03:46.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:03:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:03:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:03:53.409780 543705 memory.go:184] no items to output this cycle
I0319 17:03:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 17:04:03.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:03.409925 543705 memory.go:184] no items to output this cycle
I0319 17:04:03.409932 543705 cpu.go:275] no items to output this cycle
E0319 17:04:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:13.409828 543705 memory.go:191] Add success.
I0319 17:04:13.409833 543705 cpu.go:282] Add success.
W0319 17:04:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:04:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:04:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:04:13.420094 543705 net.go:648] Add success.
I0319 17:04:13.422676 543705 net.go:770] primary dev: ETH0
I0319 17:04:13.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:04:13.422701 543705 net.go:698] Add success.
I0319 17:04:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:04:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:04:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0319 17:04:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:04:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 17:04:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:04:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:04:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:04:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:04:16.458098 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:04:16.472484 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:04:20.121683 543705 disk_info.go:125] begin check local disk info of client
I0319 17:04:20.124225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:04:20.124232 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa680 0xc0001aa6c0]
E0319 17:04:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:23.409777 543705 memory.go:184] no items to output this cycle
I0319 17:04:23.409839 543705 cpu.go:275] no items to output this cycle
E0319 17:04:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:33.409777 543705 memory.go:184] no items to output this cycle
I0319 17:04:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 17:04:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:43.409817 543705 memory.go:191] Add success.
I0319 17:04:43.409822 543705 cpu.go:282] Add success.
I0319 17:04:43.419717 543705 net.go:770] primary dev: ETH0
I0319 17:04:43.419733 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:04:43.419748 543705 net.go:698] Add success.
I0319 17:04:43.420128 543705 net.go:648] Add success.
I0319 17:04:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:04:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:04:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:04:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:04:53.409793 543705 memory.go:184] no items to output this cycle
I0319 17:04:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:05:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:03.409790 543705 memory.go:184] no items to output this cycle
I0319 17:05:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 17:05:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:13.409823 543705 memory.go:191] Add success.
I0319 17:05:13.409828 543705 cpu.go:282] Add success.
W0319 17:05:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:05:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:05:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:05:13.420046 543705 net.go:770] primary dev: ETH0
I0319 17:05:13.420060 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:05:13.420072 543705 net.go:698] Add success.
I0319 17:05:13.420306 543705 net.go:648] Add success.
I0319 17:05:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:05:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:05:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 17:05:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:05:14.456472 543705 disk_worker.go:494] system disk:vda1
I0319 17:05:14.456516 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:05:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:05:16.458016 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:05:16.458090 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:05:16.458120 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:05:16.472532 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:05:20.125672 543705 disk_info.go:125] begin check local disk info of client
I0319 17:05:20.128227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:05:20.128233 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003499c0 0xc000349a00]
E0319 17:05:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:23.409806 543705 memory.go:184] no items to output this cycle
I0319 17:05:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 17:05:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:33.409789 543705 memory.go:184] no items to output this cycle
I0319 17:05:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 17:05:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:43.409815 543705 memory.go:191] Add success.
I0319 17:05:43.409815 543705 cpu.go:282] Add success.
I0319 17:05:43.419879 543705 net.go:648] Add success.
I0319 17:05:43.422940 543705 net.go:770] primary dev: ETH0
I0319 17:05:43.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:05:43.422964 543705 net.go:698] Add success.
I0319 17:05:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:05:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:05:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:05:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:05:53.409781 543705 memory.go:184] no items to output this cycle
I0319 17:05:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 17:06:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:03.409765 543705 memory.go:184] no items to output this cycle
I0319 17:06:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 17:06:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:13.409810 543705 memory.go:191] Add success.
I0319 17:06:13.409816 543705 cpu.go:282] Add success.
W0319 17:06:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:06:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:06:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:06:13.420218 543705 net.go:648] Add success.
I0319 17:06:13.422934 543705 net.go:770] primary dev: ETH0
I0319 17:06:13.422947 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:06:13.422958 543705 net.go:698] Add success.
I0319 17:06:13.469669 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3e944744-d94b-4089-b5b4-54fc64ca0409","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:06:13.469704 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:06:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:06:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 17:06:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:06:14.456596 543705 disk_worker.go:494] system disk:vda1
I0319 17:06:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:06:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:06:16.458009 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:06:16.458081 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:06:16.458113 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:06:16.472513 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:06:20.129676 543705 disk_info.go:125] begin check local disk info of client
I0319 17:06:20.132236 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:06:20.132242 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5e00 0xc0000c5e40]
E0319 17:06:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:23.409798 543705 memory.go:184] no items to output this cycle
I0319 17:06:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:06:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:33.409792 543705 memory.go:184] no items to output this cycle
I0319 17:06:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 17:06:37.779594 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:06:37.779600 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:06:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:43.410705 543705 memory.go:191] Add success.
I0319 17:06:43.409834 543705 cpu.go:282] Add success.
I0319 17:06:43.420438 543705 net.go:648] Add success.
I0319 17:06:43.423128 543705 net.go:770] primary dev: ETH0
I0319 17:06:43.423140 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:06:43.423153 543705 net.go:698] Add success.
I0319 17:06:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:06:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:06:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:06:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:06:53.409775 543705 memory.go:184] no items to output this cycle
I0319 17:06:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:07:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:03.409770 543705 memory.go:184] no items to output this cycle
I0319 17:07:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 17:07:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:13.409811 543705 memory.go:191] Add success.
I0319 17:07:13.409817 543705 cpu.go:282] Add success.
W0319 17:07:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:07:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:07:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:07:13.420119 543705 net.go:648] Add success.
I0319 17:07:13.423183 543705 net.go:770] primary dev: ETH0
I0319 17:07:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:07:13.423209 543705 net.go:698] Add success.
I0319 17:07:13.452862 543705 event_worker.go:152] Polling the log file for events...
W0319 17:07:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:07:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 17:07:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:07:14.456994 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:07:14.457004 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:07:14.457010 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:07:14.457027 543705 disk_worker.go:494] system disk:vda1
I0319 17:07:14.457057 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:07:15.456791 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:07:15.456800 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:07:16.458120 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:07:16.458193 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:07:16.458219 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:07:16.458222 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:07:16.472693 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:07:20.133677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:07:20.136205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:07:20.136212 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c740 0xc00035c780]
E0319 17:07:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:23.409777 543705 memory.go:184] no items to output this cycle
I0319 17:07:23.409780 543705 cpu.go:275] no items to output this cycle
E0319 17:07:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:33.409820 543705 memory.go:184] no items to output this cycle
I0319 17:07:33.409836 543705 cpu.go:275] no items to output this cycle
E0319 17:07:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:43.409778 543705 memory.go:191] Add success.
I0319 17:07:43.409797 543705 cpu.go:282] Add success.
I0319 17:07:43.419894 543705 net.go:648] Add success.
I0319 17:07:43.422674 543705 net.go:770] primary dev: ETH0
I0319 17:07:43.422687 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:07:43.422699 543705 net.go:698] Add success.
I0319 17:07:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:07:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:07:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:07:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:07:53.409777 543705 memory.go:184] no items to output this cycle
I0319 17:07:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:08:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:03.409773 543705 memory.go:184] no items to output this cycle
I0319 17:08:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 17:08:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:13.409780 543705 memory.go:191] Add success.
I0319 17:08:13.409801 543705 cpu.go:282] Add success.
W0319 17:08:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:08:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:08:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:08:13.420069 543705 net.go:648] Add success.
I0319 17:08:13.423509 543705 net.go:770] primary dev: ETH0
I0319 17:08:13.423522 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:08:13.423534 543705 net.go:698] Add success.
I0319 17:08:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:08:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:08:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 17:08:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:08:14.456516 543705 disk_worker.go:494] system disk:vda1
I0319 17:08:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:08:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:08:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:08:16.458072 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:08:16.458100 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:08:16.472523 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:08:20.137678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:08:20.140240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:08:20.140247 543705 disk_info.go:196] parse disk info done, disk is : [0xc000273080 0xc0002730c0]
E0319 17:08:23.409914 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:23.409932 543705 memory.go:184] no items to output this cycle
I0319 17:08:23.410029 543705 cpu.go:275] no items to output this cycle
E0319 17:08:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:33.409787 543705 memory.go:184] no items to output this cycle
I0319 17:08:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 17:08:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:43.409829 543705 memory.go:191] Add success.
I0319 17:08:43.409839 543705 cpu.go:282] Add success.
I0319 17:08:43.419816 543705 net.go:770] primary dev: ETH0
I0319 17:08:43.419832 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:08:43.419847 543705 net.go:698] Add success.
I0319 17:08:43.420209 543705 net.go:648] Add success.
I0319 17:08:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:08:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:08:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:08:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:08:53.409776 543705 cpu.go:275] no items to output this cycle
I0319 17:08:53.409789 543705 memory.go:184] no items to output this cycle
E0319 17:09:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:03.409781 543705 cpu.go:275] no items to output this cycle
I0319 17:09:03.409787 543705 memory.go:184] no items to output this cycle
E0319 17:09:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:13.409792 543705 memory.go:191] Add success.
I0319 17:09:13.409798 543705 cpu.go:282] Add success.
W0319 17:09:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:09:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:09:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:09:13.420041 543705 net.go:648] Add success.
I0319 17:09:13.422928 543705 net.go:770] primary dev: ETH0
I0319 17:09:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:09:13.422953 543705 net.go:698] Add success.
I0319 17:09:13.463766 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f4a332df-56e8-4514-b2d9-30080bf5713d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:09:13.463802 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:09:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:09:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 17:09:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:09:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 17:09:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:09:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:09:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:09:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:09:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:09:16.472522 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:09:20.141674 543705 disk_info.go:125] begin check local disk info of client
I0319 17:09:20.144216 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:09:20.144222 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032a2c0 0xc00032a300]
E0319 17:09:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:23.409801 543705 memory.go:184] no items to output this cycle
I0319 17:09:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 17:09:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:33.409773 543705 memory.go:184] no items to output this cycle
I0319 17:09:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 17:09:37.780594 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:09:37.780601 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:09:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:43.410790 543705 memory.go:191] Add success.
I0319 17:09:43.409809 543705 cpu.go:282] Add success.
I0319 17:09:43.420493 543705 net.go:648] Add success.
I0319 17:09:43.423401 543705 net.go:770] primary dev: ETH0
I0319 17:09:43.423415 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:09:43.423427 543705 net.go:698] Add success.
I0319 17:09:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:09:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:09:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:09:53.410242 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:09:53.410259 543705 memory.go:184] no items to output this cycle
I0319 17:09:53.410290 543705 cpu.go:275] no items to output this cycle
E0319 17:10:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:03.409794 543705 memory.go:184] no items to output this cycle
I0319 17:10:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 17:10:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:13.409782 543705 memory.go:191] Add success.
I0319 17:10:13.409799 543705 cpu.go:282] Add success.
W0319 17:10:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:10:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:10:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:10:13.420144 543705 net.go:648] Add success.
I0319 17:10:13.422966 543705 net.go:770] primary dev: ETH0
I0319 17:10:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:10:13.422991 543705 net.go:698] Add success.
I0319 17:10:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:10:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:10:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 17:10:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:10:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 17:10:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:10:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:10:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:10:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:10:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:10:16.472537 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:10:20.145679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:10:20.148235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:10:20.148241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e6080 0xc0000e60c0]
E0319 17:10:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:23.409782 543705 memory.go:184] no items to output this cycle
I0319 17:10:23.409783 543705 cpu.go:275] no items to output this cycle
E0319 17:10:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:33.409804 543705 memory.go:184] no items to output this cycle
I0319 17:10:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 17:10:43.409855 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:43.409882 543705 memory.go:191] Add success.
I0319 17:10:43.409913 543705 cpu.go:282] Add success.
I0319 17:10:43.420066 543705 net.go:648] Add success.
I0319 17:10:43.423023 543705 net.go:770] primary dev: ETH0
I0319 17:10:43.423036 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:10:43.423048 543705 net.go:698] Add success.
I0319 17:10:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:10:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:10:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:10:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:10:53.409776 543705 memory.go:184] no items to output this cycle
I0319 17:10:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 17:11:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:03.409763 543705 memory.go:184] no items to output this cycle
I0319 17:11:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 17:11:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:13.409828 543705 memory.go:191] Add success.
I0319 17:11:13.409828 543705 cpu.go:282] Add success.
W0319 17:11:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:11:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:11:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:11:13.420169 543705 net.go:648] Add success.
I0319 17:11:13.423212 543705 net.go:770] primary dev: ETH0
I0319 17:11:13.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:11:13.423244 543705 net.go:698] Add success.
I0319 17:11:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:11:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:11:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 17:11:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:11:14.456557 543705 disk_worker.go:494] system disk:vda1
I0319 17:11:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:11:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:11:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:11:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:11:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:11:16.472518 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:11:20.149668 543705 disk_info.go:125] begin check local disk info of client
I0319 17:11:20.152297 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:11:20.152305 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b3680 0xc0003b36c0]
E0319 17:11:23.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:23.409912 543705 memory.go:184] no items to output this cycle
I0319 17:11:23.409984 543705 cpu.go:275] no items to output this cycle
E0319 17:11:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:33.409796 543705 memory.go:184] no items to output this cycle
I0319 17:11:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 17:11:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:43.409786 543705 memory.go:191] Add success.
I0319 17:11:43.409818 543705 cpu.go:282] Add success.
I0319 17:11:43.419966 543705 net.go:648] Add success.
I0319 17:11:43.422491 543705 net.go:770] primary dev: ETH0
I0319 17:11:43.422505 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:11:43.422517 543705 net.go:698] Add success.
I0319 17:11:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:11:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:11:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:11:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:11:53.409771 543705 memory.go:184] no items to output this cycle
I0319 17:11:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 17:12:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:03.409793 543705 memory.go:184] no items to output this cycle
I0319 17:12:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:12:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:13.409778 543705 memory.go:191] Add success.
W0319 17:12:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:12:13.409803 543705 cpu.go:282] Add success.
W0319 17:12:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:12:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:12:13.420089 543705 net.go:648] Add success.
I0319 17:12:13.422622 543705 net.go:770] primary dev: ETH0
I0319 17:12:13.422637 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:12:13.422651 543705 net.go:698] Add success.
I0319 17:12:13.464006 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6fe8d001-6744-47fe-920d-70f270d7a2e0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:12:13.464043 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 17:12:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:12:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 17:12:14.455198 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:12:14.455895 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:12:14.455904 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:12:14.455910 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:12:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 17:12:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:12:15.456862 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:12:15.456871 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:12:16.458005 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:12:16.458007 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:12:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:12:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:12:16.472502 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:12:20.153679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:12:20.156286 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:12:20.156293 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b0380 0xc0002b03c0]
E0319 17:12:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:23.409780 543705 memory.go:184] no items to output this cycle
I0319 17:12:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 17:12:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:33.409776 543705 memory.go:184] no items to output this cycle
I0319 17:12:33.409798 543705 cpu.go:275] no items to output this cycle
I0319 17:12:37.781593 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:12:37.781600 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:12:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:43.410922 543705 memory.go:191] Add success.
I0319 17:12:43.409874 543705 cpu.go:282] Add success.
I0319 17:12:43.420663 543705 net.go:648] Add success.
I0319 17:12:43.423339 543705 net.go:770] primary dev: ETH0
I0319 17:12:43.423355 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:12:43.423372 543705 net.go:698] Add success.
I0319 17:12:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:12:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:12:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:12:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:12:53.409776 543705 memory.go:184] no items to output this cycle
I0319 17:12:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:13:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:03.409799 543705 memory.go:184] no items to output this cycle
I0319 17:13:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:13:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:13.409818 543705 memory.go:191] Add success.
I0319 17:13:13.409822 543705 cpu.go:282] Add success.
W0319 17:13:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:13:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:13:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:13:13.420124 543705 net.go:648] Add success.
I0319 17:13:13.422940 543705 net.go:770] primary dev: ETH0
I0319 17:13:13.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:13:13.422965 543705 net.go:698] Add success.
I0319 17:13:14.453929 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:13:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:13:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 17:13:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:13:14.457431 543705 disk_worker.go:494] system disk:vda1
I0319 17:13:14.457466 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:13:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:13:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:13:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:13:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:13:16.472506 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:13:20.157677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:13:20.160391 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:13:20.160398 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b0480 0xc0002b04c0]
E0319 17:13:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:23.409808 543705 memory.go:184] no items to output this cycle
I0319 17:13:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 17:13:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:33.409777 543705 memory.go:184] no items to output this cycle
I0319 17:13:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 17:13:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:43.409802 543705 memory.go:191] Add success.
I0319 17:13:43.409817 543705 cpu.go:282] Add success.
I0319 17:13:43.419863 543705 net.go:648] Add success.
I0319 17:13:43.422514 543705 net.go:770] primary dev: ETH0
I0319 17:13:43.422526 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:13:43.422541 543705 net.go:698] Add success.
I0319 17:13:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:13:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:13:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:13:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:13:53.409808 543705 memory.go:184] no items to output this cycle
I0319 17:13:53.409822 543705 cpu.go:275] no items to output this cycle
E0319 17:14:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:03.409785 543705 memory.go:184] no items to output this cycle
I0319 17:14:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 17:14:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:13.409773 543705 memory.go:191] Add success.
W0319 17:14:13.409799 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:14:13.409805 543705 cpu.go:282] Add success.
W0319 17:14:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:14:13.409813 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:14:13.420061 543705 net.go:648] Add success.
I0319 17:14:13.422940 543705 net.go:770] primary dev: ETH0
I0319 17:14:13.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:14:13.422963 543705 net.go:698] Add success.
I0319 17:14:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:14:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:14:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 17:14:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:14:14.456817 543705 disk_worker.go:494] system disk:vda1
I0319 17:14:14.456847 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:14:15.456014 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:14:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:14:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:14:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:14:16.472419 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:14:20.161678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:14:20.164221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:14:20.164228 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395080 0xc0003950c0]
E0319 17:14:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:23.409804 543705 memory.go:184] no items to output this cycle
I0319 17:14:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:14:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:33.409797 543705 memory.go:184] no items to output this cycle
I0319 17:14:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 17:14:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:43.409784 543705 memory.go:191] Add success.
I0319 17:14:43.409862 543705 cpu.go:282] Add success.
I0319 17:14:43.420066 543705 net.go:648] Add success.
I0319 17:14:43.422874 543705 net.go:770] primary dev: ETH0
I0319 17:14:43.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:14:43.422900 543705 net.go:698] Add success.
I0319 17:14:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:14:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:14:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:14:53.409772 543705 memory.go:184] no items to output this cycle
I0319 17:14:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 17:15:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:03.409779 543705 memory.go:184] no items to output this cycle
I0319 17:15:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 17:15:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:13.409788 543705 memory.go:191] Add success.
I0319 17:15:13.409788 543705 cpu.go:282] Add success.
W0319 17:15:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:15:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:15:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:15:13.420129 543705 net.go:648] Add success.
I0319 17:15:13.422951 543705 net.go:770] primary dev: ETH0
I0319 17:15:13.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:15:13.422982 543705 net.go:698] Add success.
I0319 17:15:13.469682 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a651fae5-919a-4cd9-8f43-85e395d472ce","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:15:13.469717 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:15:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:15:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:15:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 17:15:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:15:14.459000 543705 disk_worker.go:494] system disk:vda1
I0319 17:15:14.459038 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:15:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:15:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:15:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:15:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:15:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:15:20.165676 543705 disk_info.go:125] begin check local disk info of client
I0319 17:15:20.168272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:15:20.168279 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b0300 0xc0004b0340]
E0319 17:15:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:23.409783 543705 memory.go:184] no items to output this cycle
I0319 17:15:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 17:15:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:33.409799 543705 memory.go:184] no items to output this cycle
I0319 17:15:33.409814 543705 cpu.go:275] no items to output this cycle
I0319 17:15:37.781731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:15:37.781737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:15:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:43.410703 543705 memory.go:191] Add success.
I0319 17:15:43.409802 543705 cpu.go:282] Add success.
I0319 17:15:43.420420 543705 net.go:648] Add success.
I0319 17:15:43.423060 543705 net.go:770] primary dev: ETH0
I0319 17:15:43.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:15:43.423090 543705 net.go:698] Add success.
I0319 17:15:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:15:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:15:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:15:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:15:53.409781 543705 memory.go:184] no items to output this cycle
I0319 17:15:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 17:16:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:03.409785 543705 memory.go:184] no items to output this cycle
I0319 17:16:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:16:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:13.409794 543705 memory.go:191] Add success.
I0319 17:16:13.409797 543705 cpu.go:282] Add success.
W0319 17:16:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:16:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:16:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:16:13.420382 543705 net.go:648] Add success.
I0319 17:16:13.423212 543705 net.go:770] primary dev: ETH0
I0319 17:16:13.423227 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:16:13.423241 543705 net.go:698] Add success.
I0319 17:16:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:16:14.455217 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:16:14.455229 543705 disk_worker.go:708] disk space is not compliant
W0319 17:16:14.455232 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:16:14.459201 543705 disk_worker.go:494] system disk:vda1
I0319 17:16:14.459230 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:16:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:16:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:16:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:16:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:16:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:16:20.169685 543705 disk_info.go:125] begin check local disk info of client
I0319 17:16:20.172146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:16:20.172154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa180 0xc0001aa1c0]
E0319 17:16:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:23.409801 543705 memory.go:184] no items to output this cycle
I0319 17:16:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 17:16:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:33.409783 543705 memory.go:184] no items to output this cycle
I0319 17:16:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:16:43.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:43.409819 543705 cpu.go:282] Add success.
I0319 17:16:43.409829 543705 memory.go:191] Add success.
I0319 17:16:43.420067 543705 net.go:648] Add success.
I0319 17:16:43.422892 543705 net.go:770] primary dev: ETH0
I0319 17:16:43.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:16:43.422920 543705 net.go:698] Add success.
I0319 17:16:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:16:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:16:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:16:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:16:53.409809 543705 memory.go:184] no items to output this cycle
I0319 17:16:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 17:17:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:03.409769 543705 memory.go:184] no items to output this cycle
I0319 17:17:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:17:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:13.409796 543705 cpu.go:282] Add success.
I0319 17:17:13.409800 543705 memory.go:191] Add success.
W0319 17:17:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:17:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:17:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:17:13.420080 543705 net.go:648] Add success.
I0319 17:17:13.422817 543705 net.go:770] primary dev: ETH0
I0319 17:17:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:17:13.422844 543705 net.go:698] Add success.
I0319 17:17:13.453505 543705 event_worker.go:152] Polling the log file for events...
W0319 17:17:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:17:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0319 17:17:14.455160 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:17:14.456188 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:17:14.456196 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:17:14.456202 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:17:14.457028 543705 disk_worker.go:494] system disk:vda1
I0319 17:17:14.457068 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:17:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:17:15.456801 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:17:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:17:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:17:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:17:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:17:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:17:20.173677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:17:20.176096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:17:20.176102 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462780 0xc0004627c0]
E0319 17:17:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:23.409767 543705 memory.go:184] no items to output this cycle
I0319 17:17:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:17:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:33.409795 543705 memory.go:184] no items to output this cycle
I0319 17:17:33.409807 543705 cpu.go:275] no items to output this cycle
E0319 17:17:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:43.409786 543705 memory.go:191] Add success.
I0319 17:17:43.409826 543705 cpu.go:282] Add success.
I0319 17:17:43.419970 543705 net.go:648] Add success.
I0319 17:17:43.422705 543705 net.go:770] primary dev: ETH0
I0319 17:17:43.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:17:43.422732 543705 net.go:698] Add success.
I0319 17:17:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:17:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:17:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:17:53.410500 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:17:53.410519 543705 memory.go:184] no items to output this cycle
I0319 17:17:53.410521 543705 cpu.go:275] no items to output this cycle
E0319 17:18:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:03.409805 543705 memory.go:184] no items to output this cycle
I0319 17:18:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 17:18:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:13.409781 543705 memory.go:191] Add success.
W0319 17:18:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:18:13.409808 543705 cpu.go:282] Add success.
W0319 17:18:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:18:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:18:13.420263 543705 net.go:648] Add success.
I0319 17:18:13.423167 543705 net.go:770] primary dev: ETH0
I0319 17:18:13.423181 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:18:13.423196 543705 net.go:698] Add success.
I0319 17:18:13.469890 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2a03be92-c2b4-4e13-836c-76aa87292c9a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:18:13.469931 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:18:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:18:14.455296 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:18:14.455310 543705 disk_worker.go:708] disk space is not compliant
W0319 17:18:14.455315 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:18:14.456943 543705 disk_worker.go:494] system disk:vda1
I0319 17:18:14.456988 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:18:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:18:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:18:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:18:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:18:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:18:20.177677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:18:20.180185 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:18:20.180191 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b01c0 0xc0002b0200]
E0319 17:18:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:23.409790 543705 memory.go:184] no items to output this cycle
I0319 17:18:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 17:18:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:33.409800 543705 memory.go:184] no items to output this cycle
I0319 17:18:33.409816 543705 cpu.go:275] no items to output this cycle
I0319 17:18:37.783611 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:18:37.783618 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:18:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:43.410664 543705 memory.go:191] Add success.
I0319 17:18:43.409826 543705 cpu.go:282] Add success.
I0319 17:18:43.420412 543705 net.go:648] Add success.
I0319 17:18:43.423028 543705 net.go:770] primary dev: ETH0
I0319 17:18:43.423041 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:18:43.423054 543705 net.go:698] Add success.
I0319 17:18:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:18:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:18:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:18:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:18:53.409786 543705 memory.go:184] no items to output this cycle
I0319 17:18:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 17:19:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:03.409766 543705 memory.go:184] no items to output this cycle
I0319 17:19:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:19:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:13.409812 543705 memory.go:191] Add success.
I0319 17:19:13.409815 543705 cpu.go:282] Add success.
W0319 17:19:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:19:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:19:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:19:13.420054 543705 net.go:648] Add success.
I0319 17:19:13.423187 543705 net.go:770] primary dev: ETH0
I0319 17:19:13.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:19:13.423212 543705 net.go:698] Add success.
I0319 17:19:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:19:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:19:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 17:19:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:19:14.457084 543705 disk_worker.go:494] system disk:vda1
I0319 17:19:14.457119 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:19:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:19:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:19:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:19:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:19:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:19:20.181678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:19:20.184276 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:19:20.184283 543705 disk_info.go:196] parse disk info done, disk is : [0xc00052a180 0xc00052a1c0]
E0319 17:19:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:23.409771 543705 memory.go:184] no items to output this cycle
I0319 17:19:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 17:19:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:33.409761 543705 memory.go:184] no items to output this cycle
I0319 17:19:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:19:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:43.409807 543705 memory.go:191] Add success.
I0319 17:19:43.409815 543705 cpu.go:282] Add success.
I0319 17:19:43.419868 543705 net.go:648] Add success.
I0319 17:19:43.422917 543705 net.go:770] primary dev: ETH0
I0319 17:19:43.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:19:43.422946 543705 net.go:698] Add success.
I0319 17:19:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:19:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:19:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:19:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:19:53.409808 543705 memory.go:184] no items to output this cycle
I0319 17:19:53.409829 543705 cpu.go:275] no items to output this cycle
E0319 17:20:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:03.409783 543705 memory.go:184] no items to output this cycle
I0319 17:20:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 17:20:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:13.409796 543705 memory.go:191] Add success.
I0319 17:20:13.409796 543705 cpu.go:282] Add success.
W0319 17:20:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:20:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:20:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:20:13.420173 543705 net.go:648] Add success.
I0319 17:20:13.423063 543705 net.go:770] primary dev: ETH0
I0319 17:20:13.423075 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:20:13.423088 543705 net.go:698] Add success.
I0319 17:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:20:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:20:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 17:20:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:20:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 17:20:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:20:15.456016 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:20:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:20:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:20:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:20:16.472514 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:20:20.185680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:20:20.188237 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:20:20.188243 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352080 0xc0003520c0]
E0319 17:20:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:23.409773 543705 memory.go:184] no items to output this cycle
I0319 17:20:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:20:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:33.409767 543705 memory.go:184] no items to output this cycle
I0319 17:20:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 17:20:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:43.409795 543705 memory.go:191] Add success.
I0319 17:20:43.409796 543705 cpu.go:282] Add success.
I0319 17:20:43.419964 543705 net.go:648] Add success.
I0319 17:20:43.422717 543705 net.go:770] primary dev: ETH0
I0319 17:20:43.422732 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:20:43.422746 543705 net.go:698] Add success.
I0319 17:20:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:20:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:20:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:20:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:20:53.409769 543705 memory.go:184] no items to output this cycle
I0319 17:20:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 17:21:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:03.409785 543705 memory.go:184] no items to output this cycle
I0319 17:21:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 17:21:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:13.409807 543705 memory.go:191] Add success.
I0319 17:21:13.409807 543705 cpu.go:282] Add success.
W0319 17:21:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:21:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:21:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:21:13.420208 543705 net.go:648] Add success.
I0319 17:21:13.422891 543705 net.go:770] primary dev: ETH0
I0319 17:21:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:21:13.422917 543705 net.go:698] Add success.
I0319 17:21:13.464060 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a7fb0844-98c4-4295-be83-b1218642bf0a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:21:13.464093 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:21:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:21:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:21:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 17:21:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:21:14.456622 543705 disk_worker.go:494] system disk:vda1
I0319 17:21:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:21:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:21:16.458005 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:21:16.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:21:16.458099 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:21:16.472492 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:21:20.189685 543705 disk_info.go:125] begin check local disk info of client
I0319 17:21:20.192234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:21:20.192241 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272980 0xc0002729c0]
E0319 17:21:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:23.409786 543705 memory.go:184] no items to output this cycle
I0319 17:21:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:21:33.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:33.409823 543705 memory.go:184] no items to output this cycle
I0319 17:21:33.409841 543705 cpu.go:275] no items to output this cycle
I0319 17:21:37.784619 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:21:37.784626 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:21:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:43.410639 543705 memory.go:191] Add success.
I0319 17:21:43.409814 543705 cpu.go:282] Add success.
I0319 17:21:43.420563 543705 net.go:648] Add success.
I0319 17:21:43.423107 543705 net.go:770] primary dev: ETH0
I0319 17:21:43.423120 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:21:43.423133 543705 net.go:698] Add success.
I0319 17:21:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:21:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:21:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:21:53.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:21:53.409824 543705 memory.go:184] no items to output this cycle
I0319 17:21:53.409835 543705 cpu.go:275] no items to output this cycle
E0319 17:22:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:03.409813 543705 memory.go:184] no items to output this cycle
I0319 17:22:03.409829 543705 cpu.go:275] no items to output this cycle
E0319 17:22:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:13.409794 543705 memory.go:191] Add success.
I0319 17:22:13.409812 543705 cpu.go:282] Add success.
W0319 17:22:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:22:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:22:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:22:13.420060 543705 net.go:648] Add success.
I0319 17:22:13.423036 543705 net.go:770] primary dev: ETH0
I0319 17:22:13.423049 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:22:13.423062 543705 net.go:698] Add success.
W0319 17:22:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:22:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 17:22:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:22:14.456136 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:22:14.456146 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:22:14.456152 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:22:14.456457 543705 disk_worker.go:494] system disk:vda1
I0319 17:22:14.456487 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:22:15.457038 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:22:15.457051 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:22:16.457961 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:22:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:22:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:22:16.458039 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:22:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:22:20.193680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:22:20.196248 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:22:20.196256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ec500 0xc0004ec540]
E0319 17:22:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:23.409812 543705 memory.go:184] no items to output this cycle
I0319 17:22:23.409825 543705 cpu.go:275] no items to output this cycle
E0319 17:22:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:33.409798 543705 cpu.go:275] no items to output this cycle
I0319 17:22:33.409801 543705 memory.go:184] no items to output this cycle
E0319 17:22:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:43.409822 543705 memory.go:191] Add success.
I0319 17:22:43.409830 543705 cpu.go:282] Add success.
I0319 17:22:43.419953 543705 net.go:648] Add success.
I0319 17:22:43.422708 543705 net.go:770] primary dev: ETH0
I0319 17:22:43.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:22:43.422734 543705 net.go:698] Add success.
I0319 17:22:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:22:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:22:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:22:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:22:53.409774 543705 memory.go:184] no items to output this cycle
I0319 17:22:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 17:23:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:03.409796 543705 memory.go:184] no items to output this cycle
I0319 17:23:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 17:23:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:13.409782 543705 memory.go:191] Add success.
I0319 17:23:13.409804 543705 cpu.go:282] Add success.
W0319 17:23:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:23:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:23:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:23:13.420124 543705 net.go:648] Add success.
I0319 17:23:13.423073 543705 net.go:770] primary dev: ETH0
I0319 17:23:13.423087 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:23:13.423102 543705 net.go:698] Add success.
I0319 17:23:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:23:14.455213 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:23:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0319 17:23:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:23:14.456627 543705 disk_worker.go:494] system disk:vda1
I0319 17:23:14.456659 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:23:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:23:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:23:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:23:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:23:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:23:20.197678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:23:20.200256 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:23:20.200263 543705 disk_info.go:196] parse disk info done, disk is : [0xc000296080 0xc0002960c0]
E0319 17:23:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:23.409783 543705 cpu.go:275] no items to output this cycle
I0319 17:23:23.409792 543705 memory.go:184] no items to output this cycle
E0319 17:23:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:33.409782 543705 memory.go:184] no items to output this cycle
I0319 17:23:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:23:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:43.409796 543705 memory.go:191] Add success.
I0319 17:23:43.409798 543705 cpu.go:282] Add success.
I0319 17:23:43.419872 543705 net.go:648] Add success.
I0319 17:23:43.422358 543705 net.go:770] primary dev: ETH0
I0319 17:23:43.422370 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:23:43.422382 543705 net.go:698] Add success.
I0319 17:23:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:23:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:23:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:23:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:23:53.409785 543705 memory.go:184] no items to output this cycle
I0319 17:23:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:24:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:03.409778 543705 memory.go:184] no items to output this cycle
I0319 17:24:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 17:24:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:13.409801 543705 memory.go:191] Add success.
I0319 17:24:13.409800 543705 cpu.go:282] Add success.
W0319 17:24:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:24:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:24:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:24:13.420152 543705 net.go:648] Add success.
I0319 17:24:13.423051 543705 net.go:770] primary dev: ETH0
I0319 17:24:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:24:13.423076 543705 net.go:698] Add success.
I0319 17:24:13.469294 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e148f46-d143-4418-8a91-0ec92d630db3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:24:13.469331 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:24:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:24:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:24:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 17:24:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:24:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 17:24:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:24:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:24:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:24:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:24:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:24:16.472459 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:24:20.201678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:24:20.204253 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:24:20.204259 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466100 0xc000466140]
E0319 17:24:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:23.409774 543705 memory.go:184] no items to output this cycle
I0319 17:24:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:24:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:33.409800 543705 memory.go:184] no items to output this cycle
I0319 17:24:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 17:24:37.784773 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:24:37.784779 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:24:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:43.410962 543705 memory.go:191] Add success.
I0319 17:24:43.409825 543705 cpu.go:282] Add success.
I0319 17:24:43.419703 543705 net.go:648] Add success.
I0319 17:24:43.422570 543705 net.go:770] primary dev: ETH0
I0319 17:24:43.422585 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:24:43.422599 543705 net.go:698] Add success.
I0319 17:24:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:24:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:24:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:24:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:24:53.409778 543705 memory.go:184] no items to output this cycle
I0319 17:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 17:25:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:03.409771 543705 memory.go:184] no items to output this cycle
I0319 17:25:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 17:25:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:13.409793 543705 memory.go:191] Add success.
I0319 17:25:13.409798 543705 cpu.go:282] Add success.
W0319 17:25:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:25:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:25:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:25:13.420148 543705 net.go:648] Add success.
I0319 17:25:13.422806 543705 net.go:770] primary dev: ETH0
I0319 17:25:13.422821 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:25:13.422835 543705 net.go:698] Add success.
I0319 17:25:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:25:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:25:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 17:25:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:25:14.456515 543705 disk_worker.go:494] system disk:vda1
I0319 17:25:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:25:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:25:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:25:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:25:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:25:16.472107 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:25:20.206784 543705 disk_info.go:125] begin check local disk info of client
I0319 17:25:20.209400 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:25:20.209406 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4180 0xc0000c41c0]
E0319 17:25:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:23.409779 543705 memory.go:184] no items to output this cycle
I0319 17:25:23.409781 543705 cpu.go:275] no items to output this cycle
I0319 17:25:33.409776 543705 cpu.go:275] no items to output this cycle
E0319 17:25:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:33.409791 543705 memory.go:184] no items to output this cycle
E0319 17:25:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:43.409816 543705 memory.go:191] Add success.
I0319 17:25:43.409826 543705 cpu.go:282] Add success.
I0319 17:25:43.420019 543705 net.go:648] Add success.
I0319 17:25:43.422661 543705 net.go:770] primary dev: ETH0
I0319 17:25:43.422674 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:25:43.422686 543705 net.go:698] Add success.
I0319 17:25:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:25:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:25:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:25:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:25:53.409784 543705 memory.go:184] no items to output this cycle
I0319 17:25:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:26:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:03.409775 543705 memory.go:184] no items to output this cycle
I0319 17:26:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 17:26:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:13.409814 543705 memory.go:191] Add success.
I0319 17:26:13.409823 543705 cpu.go:282] Add success.
W0319 17:26:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:26:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:26:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:26:13.420136 543705 net.go:648] Add success.
I0319 17:26:13.423043 543705 net.go:770] primary dev: ETH0
I0319 17:26:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:26:13.423074 543705 net.go:698] Add success.
I0319 17:26:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:26:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:26:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 17:26:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:26:14.456619 543705 disk_worker.go:494] system disk:vda1
I0319 17:26:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:26:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:26:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:26:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:26:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:26:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:26:20.209672 543705 disk_info.go:125] begin check local disk info of client
I0319 17:26:20.212293 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:26:20.212300 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032a600 0xc00032a640]
E0319 17:26:23.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:23.409908 543705 memory.go:184] no items to output this cycle
I0319 17:26:23.409989 543705 cpu.go:275] no items to output this cycle
E0319 17:26:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:33.409802 543705 memory.go:184] no items to output this cycle
I0319 17:26:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 17:26:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:43.409814 543705 memory.go:191] Add success.
I0319 17:26:43.409820 543705 cpu.go:282] Add success.
I0319 17:26:43.419955 543705 net.go:648] Add success.
I0319 17:26:43.422873 543705 net.go:770] primary dev: ETH0
I0319 17:26:43.422888 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:26:43.422903 543705 net.go:698] Add success.
I0319 17:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:26:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:26:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:26:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:26:53.409781 543705 memory.go:184] no items to output this cycle
I0319 17:26:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:27:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:03.409815 543705 memory.go:184] no items to output this cycle
I0319 17:27:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 17:27:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:13.409788 543705 memory.go:191] Add success.
I0319 17:27:13.409813 543705 cpu.go:282] Add success.
W0319 17:27:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:27:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:27:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:27:13.420151 543705 net.go:648] Add success.
I0319 17:27:13.423063 543705 net.go:770] primary dev: ETH0
I0319 17:27:13.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:27:13.423093 543705 net.go:698] Add success.
I0319 17:27:13.429697 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 17:27:13.452813 543705 event_worker.go:152] Polling the log file for events...
I0319 17:27:13.463581 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f74adf20-6192-4be6-bb41-82951dceb9da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:27:13.463616 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 17:27:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:27:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 17:27:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:27:14.456917 543705 disk_worker.go:494] system disk:vda1
E0319 17:27:14.456916 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:27:14.456926 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:27:14.456931 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:27:14.456951 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:27:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:27:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:27:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:27:16.457919 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:27:16.457976 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:27:16.457996 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:27:16.472309 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:27:20.213679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:27:20.216236 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:27:20.216242 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f4280 0xc0001f42c0]
E0319 17:27:23.410274 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:23.410294 543705 memory.go:184] no items to output this cycle
I0319 17:27:23.410297 543705 cpu.go:275] no items to output this cycle
E0319 17:27:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:33.409766 543705 memory.go:184] no items to output this cycle
I0319 17:27:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 17:27:37.785733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:27:37.785739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:27:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:43.410761 543705 memory.go:191] Add success.
I0319 17:27:43.409821 543705 cpu.go:282] Add success.
I0319 17:27:43.420543 543705 net.go:648] Add success.
I0319 17:27:43.424425 543705 net.go:770] primary dev: ETH0
I0319 17:27:43.424439 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:27:43.424454 543705 net.go:698] Add success.
I0319 17:27:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:27:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:27:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:27:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:27:53.409771 543705 memory.go:184] no items to output this cycle
I0319 17:27:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 17:28:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:03.409794 543705 memory.go:184] no items to output this cycle
I0319 17:28:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 17:28:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:13.409790 543705 memory.go:191] Add success.
I0319 17:28:13.409811 543705 cpu.go:282] Add success.
W0319 17:28:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:28:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:28:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:28:13.420170 543705 net.go:648] Add success.
I0319 17:28:13.423199 543705 net.go:770] primary dev: ETH0
I0319 17:28:13.423213 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:28:13.423226 543705 net.go:698] Add success.
I0319 17:28:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:28:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:28:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 17:28:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:28:14.456504 543705 disk_worker.go:494] system disk:vda1
I0319 17:28:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:28:15.456017 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:28:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:28:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:28:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:28:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:28:20.217684 543705 disk_info.go:125] begin check local disk info of client
I0319 17:28:20.220260 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:28:20.220267 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e100 0xc00039e140]
E0319 17:28:23.410710 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:23.410726 543705 memory.go:184] no items to output this cycle
I0319 17:28:23.410729 543705 cpu.go:275] no items to output this cycle
E0319 17:28:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:33.409775 543705 memory.go:184] no items to output this cycle
I0319 17:28:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 17:28:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:43.409794 543705 memory.go:191] Add success.
I0319 17:28:43.409798 543705 cpu.go:282] Add success.
I0319 17:28:43.420003 543705 net.go:648] Add success.
I0319 17:28:43.422669 543705 net.go:770] primary dev: ETH0
I0319 17:28:43.422683 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:28:43.422698 543705 net.go:698] Add success.
I0319 17:28:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:28:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:28:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:28:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:28:53.409807 543705 memory.go:184] no items to output this cycle
I0319 17:28:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 17:29:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:03.409773 543705 memory.go:184] no items to output this cycle
I0319 17:29:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 17:29:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:13.409818 543705 memory.go:191] Add success.
I0319 17:29:13.409825 543705 cpu.go:282] Add success.
W0319 17:29:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:29:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:29:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:29:13.420415 543705 net.go:648] Add success.
I0319 17:29:13.423433 543705 net.go:770] primary dev: ETH0
I0319 17:29:13.423446 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:29:13.423458 543705 net.go:698] Add success.
I0319 17:29:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:29:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:29:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 17:29:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:29:14.456600 543705 disk_worker.go:494] system disk:vda1
I0319 17:29:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:29:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:29:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:29:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:29:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:29:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:29:20.221671 543705 disk_info.go:125] begin check local disk info of client
I0319 17:29:20.224326 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:29:20.224333 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000f1040 0xc0000f1080]
E0319 17:29:23.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:23.409880 543705 memory.go:184] no items to output this cycle
I0319 17:29:23.409899 543705 cpu.go:275] no items to output this cycle
E0319 17:29:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:33.409778 543705 memory.go:184] no items to output this cycle
I0319 17:29:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:29:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:43.409786 543705 memory.go:191] Add success.
I0319 17:29:43.409787 543705 cpu.go:282] Add success.
I0319 17:29:43.419888 543705 net.go:648] Add success.
I0319 17:29:43.422440 543705 net.go:770] primary dev: ETH0
I0319 17:29:43.422453 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:29:43.422465 543705 net.go:698] Add success.
I0319 17:29:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:29:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:29:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:29:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:29:53.409782 543705 memory.go:184] no items to output this cycle
I0319 17:29:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:30:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:03.409782 543705 memory.go:184] no items to output this cycle
I0319 17:30:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 17:30:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:13.409792 543705 memory.go:191] Add success.
W0319 17:30:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:30:13.409824 543705 cpu.go:282] Add success.
W0319 17:30:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:30:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:30:13.420165 543705 net.go:648] Add success.
I0319 17:30:13.423092 543705 net.go:770] primary dev: ETH0
I0319 17:30:13.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:30:13.423122 543705 net.go:698] Add success.
I0319 17:30:13.469918 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"097d2b83-ac18-40d1-af06-bd21f3334f38","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:30:13.469952 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:30:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:30:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:30:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 17:30:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:30:14.456602 543705 disk_worker.go:494] system disk:vda1
I0319 17:30:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:30:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:30:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:30:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:30:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:30:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:30:20.225676 543705 disk_info.go:125] begin check local disk info of client
I0319 17:30:20.228253 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:30:20.228260 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475980 0xc0004759c0]
E0319 17:30:23.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:23.409884 543705 memory.go:184] no items to output this cycle
I0319 17:30:23.410036 543705 cpu.go:275] no items to output this cycle
E0319 17:30:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:33.409785 543705 memory.go:184] no items to output this cycle
I0319 17:30:33.409800 543705 cpu.go:275] no items to output this cycle
I0319 17:30:37.785874 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:30:37.785881 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:30:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:43.410650 543705 memory.go:191] Add success.
I0319 17:30:43.409821 543705 cpu.go:282] Add success.
I0319 17:30:43.420438 543705 net.go:648] Add success.
I0319 17:30:43.423282 543705 net.go:770] primary dev: ETH0
I0319 17:30:43.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:30:43.423309 543705 net.go:698] Add success.
I0319 17:30:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:30:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:30:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:30:53.410237 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:30:53.410254 543705 memory.go:184] no items to output this cycle
I0319 17:30:53.410275 543705 cpu.go:275] no items to output this cycle
E0319 17:31:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:03.409804 543705 memory.go:184] no items to output this cycle
I0319 17:31:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 17:31:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:13.409791 543705 memory.go:191] Add success.
I0319 17:31:13.409813 543705 cpu.go:282] Add success.
W0319 17:31:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:31:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:31:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:31:13.420253 543705 net.go:648] Add success.
I0319 17:31:13.423364 543705 net.go:770] primary dev: ETH0
I0319 17:31:13.423377 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:31:13.423389 543705 net.go:698] Add success.
I0319 17:31:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:31:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:31:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 17:31:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:31:14.456515 543705 disk_worker.go:494] system disk:vda1
I0319 17:31:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:31:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:31:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:31:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:31:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:31:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:31:20.229682 543705 disk_info.go:125] begin check local disk info of client
I0319 17:31:20.232254 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:31:20.232261 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a740 0xc00007a780]
E0319 17:31:23.410408 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:23.410423 543705 cpu.go:275] no items to output this cycle
I0319 17:31:23.410425 543705 memory.go:184] no items to output this cycle
E0319 17:31:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:33.409792 543705 memory.go:184] no items to output this cycle
I0319 17:31:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 17:31:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:43.409814 543705 memory.go:191] Add success.
I0319 17:31:43.409823 543705 cpu.go:282] Add success.
I0319 17:31:43.419954 543705 net.go:648] Add success.
I0319 17:31:43.422852 543705 net.go:770] primary dev: ETH0
I0319 17:31:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:31:43.422881 543705 net.go:698] Add success.
I0319 17:31:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:31:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:31:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:31:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:31:53.409781 543705 cpu.go:275] no items to output this cycle
I0319 17:31:53.409784 543705 memory.go:184] no items to output this cycle
E0319 17:32:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:03.409799 543705 memory.go:184] no items to output this cycle
I0319 17:32:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:32:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:13.409783 543705 memory.go:191] Add success.
I0319 17:32:13.409804 543705 cpu.go:282] Add success.
W0319 17:32:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:32:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:32:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:32:13.420270 543705 net.go:648] Add success.
I0319 17:32:13.423323 543705 net.go:770] primary dev: ETH0
I0319 17:32:13.423337 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:32:13.423352 543705 net.go:698] Add success.
W0319 17:32:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:32:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 17:32:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:32:14.456131 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:32:14.456141 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:32:14.456147 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:32:14.456462 543705 disk_worker.go:494] system disk:vda1
I0319 17:32:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:32:15.456800 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:32:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:32:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:32:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:32:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:32:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:32:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:32:20.233676 543705 disk_info.go:125] begin check local disk info of client
I0319 17:32:20.236317 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:32:20.236324 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492e40 0xc000492e80]
E0319 17:32:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:23.409795 543705 memory.go:184] no items to output this cycle
I0319 17:32:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 17:32:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:33.409774 543705 memory.go:184] no items to output this cycle
I0319 17:32:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 17:32:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:43.409822 543705 memory.go:191] Add success.
I0319 17:32:43.409823 543705 cpu.go:282] Add success.
I0319 17:32:43.420019 543705 net.go:648] Add success.
I0319 17:32:43.423092 543705 net.go:770] primary dev: ETH0
I0319 17:32:43.423106 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:32:43.423118 543705 net.go:698] Add success.
I0319 17:32:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:32:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:32:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:32:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:32:53.409772 543705 memory.go:184] no items to output this cycle
I0319 17:32:53.409820 543705 cpu.go:275] no items to output this cycle
E0319 17:33:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:03.409795 543705 memory.go:184] no items to output this cycle
I0319 17:33:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:33:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:13.409788 543705 memory.go:191] Add success.
I0319 17:33:13.409805 543705 cpu.go:282] Add success.
W0319 17:33:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:33:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:33:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:33:13.420098 543705 net.go:648] Add success.
I0319 17:33:13.422802 543705 net.go:770] primary dev: ETH0
I0319 17:33:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:33:13.422827 543705 net.go:698] Add success.
I0319 17:33:13.469582 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"33c42cab-3cb6-4aff-8c32-2b43496041e4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:33:13.469620 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:33:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:33:14.455344 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:33:14.455358 543705 disk_worker.go:708] disk space is not compliant
W0319 17:33:14.455361 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:33:14.457494 543705 disk_worker.go:494] system disk:vda1
I0319 17:33:14.457536 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:33:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:33:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:33:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:33:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:33:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:33:20.237678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:33:20.240221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:33:20.240228 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395000 0xc000395040]
E0319 17:33:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:23.409797 543705 memory.go:184] no items to output this cycle
I0319 17:33:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 17:33:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:33.409779 543705 memory.go:184] no items to output this cycle
I0319 17:33:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 17:33:37.786022 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:33:37.786028 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:33:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:43.410677 543705 memory.go:191] Add success.
I0319 17:33:43.409827 543705 cpu.go:282] Add success.
I0319 17:33:43.420464 543705 net.go:648] Add success.
I0319 17:33:43.423322 543705 net.go:770] primary dev: ETH0
I0319 17:33:43.423336 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:33:43.423361 543705 net.go:698] Add success.
I0319 17:33:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:33:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:33:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:33:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:33:53.409778 543705 memory.go:184] no items to output this cycle
I0319 17:33:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 17:34:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:03.409804 543705 memory.go:184] no items to output this cycle
I0319 17:34:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 17:34:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:13.409783 543705 memory.go:191] Add success.
I0319 17:34:13.409808 543705 cpu.go:282] Add success.
W0319 17:34:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:34:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:34:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:34:13.420088 543705 net.go:648] Add success.
I0319 17:34:13.422882 543705 net.go:770] primary dev: ETH0
I0319 17:34:13.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:34:13.422912 543705 net.go:698] Add success.
I0319 17:34:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:34:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:34:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 17:34:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:34:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 17:34:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:34:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:34:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:34:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:34:16.472518 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:34:20.241687 543705 disk_info.go:125] begin check local disk info of client
I0319 17:34:20.244249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:34:20.244256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003562c0 0xc000356300]
E0319 17:34:23.410255 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:23.410271 543705 memory.go:184] no items to output this cycle
I0319 17:34:23.410274 543705 cpu.go:275] no items to output this cycle
E0319 17:34:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:33.409803 543705 memory.go:184] no items to output this cycle
I0319 17:34:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:34:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:43.409812 543705 memory.go:191] Add success.
I0319 17:34:43.409825 543705 cpu.go:282] Add success.
I0319 17:34:43.419950 543705 net.go:648] Add success.
I0319 17:34:43.422881 543705 net.go:770] primary dev: ETH0
I0319 17:34:43.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:34:43.422907 543705 net.go:698] Add success.
I0319 17:34:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:34:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:34:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:34:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:34:53.409799 543705 memory.go:184] no items to output this cycle
I0319 17:34:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 17:35:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:03.409775 543705 memory.go:184] no items to output this cycle
I0319 17:35:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:35:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:13.409835 543705 memory.go:191] Add success.
I0319 17:35:13.409847 543705 cpu.go:282] Add success.
W0319 17:35:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:35:13.409891 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:35:13.409896 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:35:13.420327 543705 net.go:648] Add success.
I0319 17:35:13.422947 543705 net.go:770] primary dev: ETH0
I0319 17:35:13.422961 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:35:13.422973 543705 net.go:698] Add success.
I0319 17:35:14.453954 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:35:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:35:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0319 17:35:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:35:14.456560 543705 disk_worker.go:494] system disk:vda1
I0319 17:35:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:35:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:35:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:35:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:35:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:35:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:35:20.245680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:35:20.248270 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:35:20.248276 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278300 0xc000278340]
E0319 17:35:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:23.409797 543705 memory.go:184] no items to output this cycle
I0319 17:35:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:35:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:33.409803 543705 memory.go:184] no items to output this cycle
I0319 17:35:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 17:35:43.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:43.409773 543705 memory.go:191] Add success.
I0319 17:35:43.409805 543705 cpu.go:282] Add success.
I0319 17:35:43.419866 543705 net.go:648] Add success.
I0319 17:35:43.422771 543705 net.go:770] primary dev: ETH0
I0319 17:35:43.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:35:43.422801 543705 net.go:698] Add success.
I0319 17:35:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:35:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:35:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:35:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:35:53.409779 543705 memory.go:184] no items to output this cycle
I0319 17:35:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 17:36:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:03.409776 543705 memory.go:184] no items to output this cycle
I0319 17:36:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 17:36:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:13.409790 543705 cpu.go:282] Add success.
I0319 17:36:13.409795 543705 memory.go:191] Add success.
W0319 17:36:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:36:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:36:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:36:13.420211 543705 net.go:648] Add success.
I0319 17:36:13.422858 543705 net.go:770] primary dev: ETH0
I0319 17:36:13.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:36:13.422888 543705 net.go:698] Add success.
I0319 17:36:13.469641 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fdbc3cc3-3ad4-4d3c-aa91-cfa0478a935c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:36:13.469692 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:36:14.454949 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:36:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:36:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 17:36:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:36:14.456680 543705 disk_worker.go:494] system disk:vda1
I0319 17:36:14.456714 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:36:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:36:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:36:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:36:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:36:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:36:20.249679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:36:20.252271 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:36:20.252278 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003904c0 0xc000390980]
E0319 17:36:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:23.409783 543705 memory.go:184] no items to output this cycle
I0319 17:36:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 17:36:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:33.409782 543705 memory.go:184] no items to output this cycle
I0319 17:36:33.409790 543705 cpu.go:275] no items to output this cycle
I0319 17:36:37.787637 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:36:37.787643 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:36:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:43.410659 543705 memory.go:191] Add success.
I0319 17:36:43.409800 543705 cpu.go:282] Add success.
I0319 17:36:43.420354 543705 net.go:648] Add success.
I0319 17:36:43.422929 543705 net.go:770] primary dev: ETH0
I0319 17:36:43.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:36:43.422954 543705 net.go:698] Add success.
I0319 17:36:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:36:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:36:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:36:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:36:53.409775 543705 memory.go:184] no items to output this cycle
I0319 17:36:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 17:37:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:03.409788 543705 memory.go:184] no items to output this cycle
I0319 17:37:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 17:37:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:13.409800 543705 memory.go:191] Add success.
I0319 17:37:13.409804 543705 cpu.go:282] Add success.
W0319 17:37:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:37:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:37:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:37:13.420310 543705 net.go:648] Add success.
I0319 17:37:13.423089 543705 net.go:770] primary dev: ETH0
I0319 17:37:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:37:13.423115 543705 net.go:698] Add success.
I0319 17:37:13.453673 543705 event_worker.go:152] Polling the log file for events...
W0319 17:37:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:37:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 17:37:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:37:14.458426 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:37:14.458462 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:37:14.458467 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:37:14.458455 543705 disk_worker.go:494] system disk:vda1
I0319 17:37:14.458500 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:37:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:37:15.456849 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:37:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:37:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:37:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:37:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:37:16.472329 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:37:20.253689 543705 disk_info.go:125] begin check local disk info of client
I0319 17:37:20.256231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:37:20.256238 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047e180 0xc00047e1c0]
E0319 17:37:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:23.409809 543705 memory.go:184] no items to output this cycle
I0319 17:37:23.409823 543705 cpu.go:275] no items to output this cycle
E0319 17:37:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:33.409806 543705 memory.go:184] no items to output this cycle
I0319 17:37:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 17:37:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:43.409799 543705 memory.go:191] Add success.
I0319 17:37:43.409821 543705 cpu.go:282] Add success.
I0319 17:37:43.419962 543705 net.go:648] Add success.
I0319 17:37:43.422701 543705 net.go:770] primary dev: ETH0
I0319 17:37:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:37:43.422730 543705 net.go:698] Add success.
I0319 17:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:37:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:37:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:37:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:37:53.409807 543705 memory.go:184] no items to output this cycle
I0319 17:37:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 17:38:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:03.409795 543705 memory.go:184] no items to output this cycle
I0319 17:38:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 17:38:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:13.409829 543705 memory.go:191] Add success.
I0319 17:38:13.409838 543705 cpu.go:282] Add success.
W0319 17:38:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:38:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:38:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:38:13.420221 543705 net.go:648] Add success.
I0319 17:38:13.423162 543705 net.go:770] primary dev: ETH0
I0319 17:38:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:38:13.423189 543705 net.go:698] Add success.
I0319 17:38:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:38:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:38:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 17:38:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:38:14.456589 543705 disk_worker.go:494] system disk:vda1
I0319 17:38:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:38:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:38:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:38:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:38:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:38:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:38:20.257680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:38:20.260235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:38:20.260242 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e0100 0xc0003e0140]
E0319 17:38:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:23.409778 543705 memory.go:184] no items to output this cycle
I0319 17:38:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 17:38:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:33.409801 543705 memory.go:184] no items to output this cycle
I0319 17:38:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:38:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:43.409808 543705 memory.go:191] Add success.
I0319 17:38:43.409810 543705 cpu.go:282] Add success.
I0319 17:38:43.419958 543705 net.go:648] Add success.
I0319 17:38:43.422893 543705 net.go:770] primary dev: ETH0
I0319 17:38:43.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:38:43.422918 543705 net.go:698] Add success.
I0319 17:38:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:38:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:38:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:38:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:38:53.409785 543705 memory.go:184] no items to output this cycle
I0319 17:38:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 17:39:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:03.409773 543705 memory.go:184] no items to output this cycle
I0319 17:39:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:39:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:13.409829 543705 memory.go:191] Add success.
I0319 17:39:13.409836 543705 cpu.go:282] Add success.
W0319 17:39:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:39:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:39:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:39:13.420189 543705 net.go:648] Add success.
I0319 17:39:13.423236 543705 net.go:770] primary dev: ETH0
I0319 17:39:13.423248 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:39:13.423262 543705 net.go:698] Add success.
I0319 17:39:13.468126 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"54c8413e-b96c-40cd-8e5f-953146984956","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:39:13.468158 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:39:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:39:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:39:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 17:39:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:39:14.456527 543705 disk_worker.go:494] system disk:vda1
I0319 17:39:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:39:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:39:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:39:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:39:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:39:20.261684 543705 disk_info.go:125] begin check local disk info of client
I0319 17:39:20.264211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:39:20.264218 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474000 0xc000474040]
E0319 17:39:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:23.409766 543705 memory.go:184] no items to output this cycle
I0319 17:39:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 17:39:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:33.409769 543705 memory.go:184] no items to output this cycle
I0319 17:39:33.409795 543705 cpu.go:275] no items to output this cycle
I0319 17:39:37.788639 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:39:37.788646 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:39:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:43.410828 543705 memory.go:191] Add success.
I0319 17:39:43.409828 543705 cpu.go:282] Add success.
I0319 17:39:43.420538 543705 net.go:648] Add success.
I0319 17:39:43.423333 543705 net.go:770] primary dev: ETH0
I0319 17:39:43.423345 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:39:43.423359 543705 net.go:698] Add success.
I0319 17:39:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:39:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:39:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:39:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:39:53.409784 543705 memory.go:184] no items to output this cycle
I0319 17:39:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 17:40:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:03.409776 543705 memory.go:184] no items to output this cycle
I0319 17:40:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 17:40:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:13.409786 543705 memory.go:191] Add success.
I0319 17:40:13.409803 543705 cpu.go:282] Add success.
W0319 17:40:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:40:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:40:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:40:13.420153 543705 net.go:648] Add success.
I0319 17:40:13.422986 543705 net.go:770] primary dev: ETH0
I0319 17:40:13.423000 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:40:13.423013 543705 net.go:698] Add success.
I0319 17:40:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:40:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:40:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0319 17:40:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:40:14.456469 543705 disk_worker.go:494] system disk:vda1
I0319 17:40:14.456515 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:40:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:40:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:40:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:40:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:40:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:40:20.265673 543705 disk_info.go:125] begin check local disk info of client
I0319 17:40:20.268214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:40:20.268220 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7900 0xc0003b7940]
E0319 17:40:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:23.409873 543705 memory.go:184] no items to output this cycle
I0319 17:40:23.409979 543705 cpu.go:275] no items to output this cycle
E0319 17:40:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:33.409775 543705 memory.go:184] no items to output this cycle
I0319 17:40:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:40:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:43.409808 543705 cpu.go:282] Add success.
I0319 17:40:43.409817 543705 memory.go:191] Add success.
I0319 17:40:43.419982 543705 net.go:648] Add success.
I0319 17:40:43.422821 543705 net.go:770] primary dev: ETH0
I0319 17:40:43.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:40:43.422845 543705 net.go:698] Add success.
I0319 17:40:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:40:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:40:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:40:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:40:53.409778 543705 memory.go:184] no items to output this cycle
I0319 17:40:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:41:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:03.409779 543705 memory.go:184] no items to output this cycle
I0319 17:41:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 17:41:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:13.409809 543705 memory.go:191] Add success.
I0319 17:41:13.409815 543705 cpu.go:282] Add success.
W0319 17:41:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:41:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:41:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:41:13.420223 543705 net.go:648] Add success.
I0319 17:41:13.423146 543705 net.go:770] primary dev: ETH0
I0319 17:41:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:41:13.423171 543705 net.go:698] Add success.
I0319 17:41:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:41:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:41:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 17:41:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:41:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 17:41:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:41:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:41:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:41:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:41:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:41:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:41:20.269678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:41:20.272272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:41:20.272279 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003beb80 0xc0003bebc0]
E0319 17:41:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:23.409779 543705 memory.go:184] no items to output this cycle
I0319 17:41:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 17:41:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:33.409806 543705 memory.go:184] no items to output this cycle
I0319 17:41:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 17:41:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:43.409787 543705 memory.go:191] Add success.
I0319 17:41:43.409803 543705 cpu.go:282] Add success.
I0319 17:41:43.419881 543705 net.go:648] Add success.
I0319 17:41:43.422755 543705 net.go:770] primary dev: ETH0
I0319 17:41:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:41:43.422781 543705 net.go:698] Add success.
I0319 17:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:41:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:41:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:41:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:41:53.409799 543705 memory.go:184] no items to output this cycle
I0319 17:41:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 17:42:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:03.409787 543705 memory.go:184] no items to output this cycle
I0319 17:42:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 17:42:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:13.409784 543705 cpu.go:282] Add success.
I0319 17:42:13.409791 543705 memory.go:191] Add success.
W0319 17:42:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:42:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:42:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:42:13.420072 543705 net.go:648] Add success.
I0319 17:42:13.422777 543705 net.go:770] primary dev: ETH0
I0319 17:42:13.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:42:13.422805 543705 net.go:698] Add success.
I0319 17:42:13.468983 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cc245613-d879-4656-823d-3aebb3aa737c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:42:13.469018 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 17:42:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:42:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0319 17:42:14.455251 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:42:14.456093 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:42:14.456102 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:42:14.456107 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:42:14.457045 543705 disk_worker.go:494] system disk:vda1
I0319 17:42:14.457077 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:42:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:42:15.456850 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:42:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:42:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:42:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:42:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:42:16.472333 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:42:20.273677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:42:20.276294 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:42:20.276301 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396fc0 0xc000397000]
E0319 17:42:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:23.409765 543705 memory.go:184] no items to output this cycle
I0319 17:42:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:42:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:33.409761 543705 memory.go:184] no items to output this cycle
I0319 17:42:33.409791 543705 cpu.go:275] no items to output this cycle
I0319 17:42:37.789672 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:42:37.789679 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:42:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:43.410758 543705 memory.go:191] Add success.
I0319 17:42:43.409788 543705 cpu.go:282] Add success.
I0319 17:42:43.420533 543705 net.go:648] Add success.
I0319 17:42:43.423436 543705 net.go:770] primary dev: ETH0
I0319 17:42:43.423451 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:42:43.423465 543705 net.go:698] Add success.
I0319 17:42:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:42:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:42:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:42:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:42:53.409772 543705 memory.go:184] no items to output this cycle
I0319 17:42:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 17:43:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:03.409776 543705 memory.go:184] no items to output this cycle
I0319 17:43:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 17:43:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:13.409781 543705 memory.go:191] Add success.
W0319 17:43:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:43:13.409809 543705 cpu.go:282] Add success.
W0319 17:43:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:43:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:43:13.420145 543705 net.go:648] Add success.
I0319 17:43:13.422908 543705 net.go:770] primary dev: ETH0
I0319 17:43:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:43:13.422933 543705 net.go:698] Add success.
I0319 17:43:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:43:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:43:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 17:43:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:43:14.456531 543705 disk_worker.go:494] system disk:vda1
I0319 17:43:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:43:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:43:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:43:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:43:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:43:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:43:20.277681 543705 disk_info.go:125] begin check local disk info of client
I0319 17:43:20.280267 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:43:20.280274 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004686c0 0xc000468700]
E0319 17:43:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:23.409798 543705 memory.go:184] no items to output this cycle
I0319 17:43:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 17:43:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:33.409786 543705 memory.go:184] no items to output this cycle
I0319 17:43:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 17:43:43.409909 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:43.409949 543705 memory.go:191] Add success.
I0319 17:43:43.410204 543705 cpu.go:282] Add success.
I0319 17:43:43.419708 543705 net.go:648] Add success.
I0319 17:43:43.422563 543705 net.go:770] primary dev: ETH0
I0319 17:43:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:43:43.422588 543705 net.go:698] Add success.
I0319 17:43:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:43:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:43:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:43:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:43:53.409781 543705 memory.go:184] no items to output this cycle
I0319 17:43:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 17:44:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:03.409774 543705 memory.go:184] no items to output this cycle
I0319 17:44:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 17:44:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:13.409811 543705 memory.go:191] Add success.
I0319 17:44:13.409819 543705 cpu.go:282] Add success.
W0319 17:44:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:44:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:44:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:44:13.420199 543705 net.go:648] Add success.
I0319 17:44:13.422902 543705 net.go:770] primary dev: ETH0
I0319 17:44:13.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:44:13.422929 543705 net.go:698] Add success.
I0319 17:44:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:44:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:44:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 17:44:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:44:14.456517 543705 disk_worker.go:494] system disk:vda1
I0319 17:44:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:44:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:44:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:44:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:44:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:44:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:44:20.281677 543705 disk_info.go:125] begin check local disk info of client
I0319 17:44:20.284262 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:44:20.284270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b66c0 0xc0003b6700]
E0319 17:44:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:23.409760 543705 memory.go:184] no items to output this cycle
I0319 17:44:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 17:44:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:33.409766 543705 memory.go:184] no items to output this cycle
I0319 17:44:33.409788 543705 cpu.go:275] no items to output this cycle
E0319 17:44:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:43.409793 543705 memory.go:191] Add success.
I0319 17:44:43.409811 543705 cpu.go:282] Add success.
I0319 17:44:43.419998 543705 net.go:648] Add success.
I0319 17:44:43.422696 543705 net.go:770] primary dev: ETH0
I0319 17:44:43.422710 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:44:43.422723 543705 net.go:698] Add success.
I0319 17:44:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:44:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:44:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:44:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:44:53.409787 543705 memory.go:184] no items to output this cycle
I0319 17:44:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 17:45:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:03.409795 543705 memory.go:184] no items to output this cycle
I0319 17:45:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 17:45:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:13.409775 543705 memory.go:191] Add success.
W0319 17:45:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:45:13.409806 543705 cpu.go:282] Add success.
W0319 17:45:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:45:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:45:13.420136 543705 net.go:648] Add success.
I0319 17:45:13.423342 543705 net.go:770] primary dev: ETH0
I0319 17:45:13.423355 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:45:13.423367 543705 net.go:698] Add success.
I0319 17:45:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:45:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:45:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 17:45:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:45:14.456563 543705 disk_worker.go:494] system disk:vda1
I0319 17:45:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:45:14.605854 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"66ff2508-6f47-4817-8ce7-ff2c34ae1a3e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:45:14.605889 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:45:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:45:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:45:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:45:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:45:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:45:20.285678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:45:20.288332 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:45:20.288339 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034a7c0 0xc00034a800]
E0319 17:45:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:23.409768 543705 memory.go:184] no items to output this cycle
I0319 17:45:23.409800 543705 cpu.go:275] no items to output this cycle
I0319 17:45:33.409905 543705 cpu.go:275] no items to output this cycle
E0319 17:45:33.409922 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:33.409996 543705 memory.go:184] no items to output this cycle
I0319 17:45:37.789813 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:45:37.789819 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:45:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:43.410813 543705 memory.go:191] Add success.
I0319 17:45:43.409819 543705 cpu.go:282] Add success.
I0319 17:45:43.420603 543705 net.go:648] Add success.
I0319 17:45:43.423632 543705 net.go:770] primary dev: ETH0
I0319 17:45:43.423644 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:45:43.423656 543705 net.go:698] Add success.
I0319 17:45:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:45:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:45:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:45:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:45:53.409775 543705 memory.go:184] no items to output this cycle
I0319 17:45:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:46:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:03.409778 543705 memory.go:184] no items to output this cycle
I0319 17:46:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 17:46:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:13.409791 543705 memory.go:191] Add success.
I0319 17:46:13.409800 543705 cpu.go:282] Add success.
W0319 17:46:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:46:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:46:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:46:13.420128 543705 net.go:648] Add success.
I0319 17:46:13.422855 543705 net.go:770] primary dev: ETH0
I0319 17:46:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:46:13.422884 543705 net.go:698] Add success.
I0319 17:46:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:46:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:46:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 17:46:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:46:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 17:46:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:46:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:46:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:46:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:46:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:46:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:46:20.289680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:46:20.292288 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:46:20.292294 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005e8f00 0xc0005e8f40]
E0319 17:46:23.410349 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:23.410369 543705 memory.go:184] no items to output this cycle
I0319 17:46:23.410398 543705 cpu.go:275] no items to output this cycle
E0319 17:46:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:33.409768 543705 memory.go:184] no items to output this cycle
I0319 17:46:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:46:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:43.409796 543705 memory.go:191] Add success.
I0319 17:46:43.409796 543705 cpu.go:282] Add success.
I0319 17:46:43.419982 543705 net.go:648] Add success.
I0319 17:46:43.422589 543705 net.go:770] primary dev: ETH0
I0319 17:46:43.422603 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:46:43.422615 543705 net.go:698] Add success.
I0319 17:46:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:46:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:46:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:46:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:46:53.409773 543705 memory.go:184] no items to output this cycle
I0319 17:46:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 17:47:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:03.409808 543705 memory.go:184] no items to output this cycle
I0319 17:47:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 17:47:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:13.409814 543705 memory.go:191] Add success.
I0319 17:47:13.409823 543705 cpu.go:282] Add success.
W0319 17:47:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:47:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:47:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:47:13.420162 543705 net.go:648] Add success.
I0319 17:47:13.422879 543705 net.go:770] primary dev: ETH0
I0319 17:47:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:47:13.422904 543705 net.go:698] Add success.
I0319 17:47:13.453472 543705 event_worker.go:152] Polling the log file for events...
W0319 17:47:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:47:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 17:47:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:47:14.455877 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:47:14.455885 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:47:14.455891 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:47:14.456545 543705 disk_worker.go:494] system disk:vda1
I0319 17:47:14.456576 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:47:15.456874 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:47:15.456883 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:47:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:47:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:47:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:47:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:47:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:47:20.293678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:47:20.296278 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:47:20.296285 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029ebc0 0xc00029ec00]
E0319 17:47:23.410237 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:23.410255 543705 memory.go:184] no items to output this cycle
I0319 17:47:23.410259 543705 cpu.go:275] no items to output this cycle
E0319 17:47:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:33.409781 543705 memory.go:184] no items to output this cycle
I0319 17:47:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 17:47:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:43.409785 543705 memory.go:191] Add success.
I0319 17:47:43.409820 543705 cpu.go:282] Add success.
I0319 17:47:43.419966 543705 net.go:648] Add success.
I0319 17:47:43.422956 543705 net.go:770] primary dev: ETH0
I0319 17:47:43.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:47:43.422981 543705 net.go:698] Add success.
I0319 17:47:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:47:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:47:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:47:53.410355 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:47:53.410374 543705 memory.go:184] no items to output this cycle
I0319 17:47:53.410391 543705 cpu.go:275] no items to output this cycle
E0319 17:48:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:03.409795 543705 memory.go:184] no items to output this cycle
I0319 17:48:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 17:48:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:13.409787 543705 memory.go:191] Add success.
I0319 17:48:13.409813 543705 cpu.go:282] Add success.
W0319 17:48:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:48:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:48:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:48:13.420156 543705 net.go:648] Add success.
I0319 17:48:13.423170 543705 net.go:770] primary dev: ETH0
I0319 17:48:13.423184 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:48:13.423196 543705 net.go:698] Add success.
I0319 17:48:14.455132 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:48:14.455222 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:48:14.455233 543705 disk_worker.go:708] disk space is not compliant
W0319 17:48:14.455236 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:48:14.456666 543705 disk_worker.go:494] system disk:vda1
I0319 17:48:14.456701 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:48:14.643391 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ab3dd75e-c69e-49bd-88c2-e9849a5354d3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:48:14.643426 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:48:15.454982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:48:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:48:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:48:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:48:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:48:20.297681 543705 disk_info.go:125] begin check local disk info of client
I0319 17:48:20.300211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:48:20.300218 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340a80 0xc000340ac0]
E0319 17:48:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:23.409805 543705 memory.go:184] no items to output this cycle
I0319 17:48:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 17:48:33.409893 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:33.409916 543705 memory.go:184] no items to output this cycle
I0319 17:48:33.409921 543705 cpu.go:275] no items to output this cycle
I0319 17:48:37.789955 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:48:37.789961 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:48:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:43.410749 543705 memory.go:191] Add success.
I0319 17:48:43.409808 543705 cpu.go:282] Add success.
I0319 17:48:43.420439 543705 net.go:648] Add success.
I0319 17:48:43.423180 543705 net.go:770] primary dev: ETH0
I0319 17:48:43.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:48:43.423211 543705 net.go:698] Add success.
I0319 17:48:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:48:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:48:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:48:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:48:53.409792 543705 memory.go:184] no items to output this cycle
I0319 17:48:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 17:49:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:03.409771 543705 memory.go:184] no items to output this cycle
I0319 17:49:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 17:49:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:13.409786 543705 memory.go:191] Add success.
I0319 17:49:13.409809 543705 cpu.go:282] Add success.
W0319 17:49:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:49:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:49:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:49:13.420241 543705 net.go:648] Add success.
I0319 17:49:13.423157 543705 net.go:770] primary dev: ETH0
I0319 17:49:13.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:49:13.423181 543705 net.go:698] Add success.
I0319 17:49:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:49:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:49:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 17:49:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:49:14.456623 543705 disk_worker.go:494] system disk:vda1
I0319 17:49:14.456653 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:49:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:49:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:49:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:49:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:49:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:49:20.301686 543705 disk_info.go:125] begin check local disk info of client
I0319 17:49:20.304243 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:49:20.304251 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a41c0 0xc0002a4200]
E0319 17:49:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:23.409806 543705 memory.go:184] no items to output this cycle
I0319 17:49:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:49:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:33.409765 543705 memory.go:184] no items to output this cycle
I0319 17:49:33.409807 543705 cpu.go:275] no items to output this cycle
E0319 17:49:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:43.409814 543705 memory.go:191] Add success.
I0319 17:49:43.409819 543705 cpu.go:282] Add success.
I0319 17:49:43.419894 543705 net.go:648] Add success.
I0319 17:49:43.422664 543705 net.go:770] primary dev: ETH0
I0319 17:49:43.422678 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:49:43.422689 543705 net.go:698] Add success.
I0319 17:49:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:49:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:49:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:49:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:49:53.409785 543705 cpu.go:275] no items to output this cycle
I0319 17:49:53.409789 543705 memory.go:184] no items to output this cycle
E0319 17:50:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:03.409785 543705 memory.go:184] no items to output this cycle
I0319 17:50:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 17:50:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:13.409813 543705 memory.go:191] Add success.
I0319 17:50:13.409822 543705 cpu.go:282] Add success.
W0319 17:50:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:50:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:50:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:50:13.420115 543705 net.go:648] Add success.
I0319 17:50:13.422984 543705 net.go:770] primary dev: ETH0
I0319 17:50:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:50:13.423008 543705 net.go:698] Add success.
I0319 17:50:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:50:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:50:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 17:50:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:50:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 17:50:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:50:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:50:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:50:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:50:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:50:16.472090 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:50:20.305696 543705 disk_info.go:125] begin check local disk info of client
I0319 17:50:20.308301 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:50:20.308309 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee000 0xc0003ee040]
E0319 17:50:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:23.409782 543705 memory.go:184] no items to output this cycle
I0319 17:50:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 17:50:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:33.409791 543705 memory.go:184] no items to output this cycle
I0319 17:50:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 17:50:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:43.409805 543705 memory.go:191] Add success.
I0319 17:50:43.409805 543705 cpu.go:282] Add success.
I0319 17:50:43.419953 543705 net.go:648] Add success.
I0319 17:50:43.423092 543705 net.go:770] primary dev: ETH0
I0319 17:50:43.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:50:43.423118 543705 net.go:698] Add success.
I0319 17:50:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:50:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:50:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:50:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:50:53.409815 543705 memory.go:184] no items to output this cycle
I0319 17:50:53.409827 543705 cpu.go:275] no items to output this cycle
E0319 17:51:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:03.409792 543705 memory.go:184] no items to output this cycle
I0319 17:51:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:51:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:13.409818 543705 memory.go:191] Add success.
I0319 17:51:13.409830 543705 cpu.go:282] Add success.
W0319 17:51:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:51:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:51:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:51:13.420118 543705 net.go:648] Add success.
I0319 17:51:13.422915 543705 net.go:770] primary dev: ETH0
I0319 17:51:13.422930 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:51:13.422944 543705 net.go:698] Add success.
I0319 17:51:13.469339 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a48e001b-2227-43e2-b734-f5027474855c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:51:13.469372 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:51:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:51:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:51:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 17:51:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:51:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 17:51:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:51:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:51:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:51:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:51:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:51:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:51:20.309684 543705 disk_info.go:125] begin check local disk info of client
I0319 17:51:20.312249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:51:20.312256 543705 disk_info.go:196] parse disk info done, disk is : [0xc000482ac0 0xc000482b00]
E0319 17:51:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:23.409809 543705 memory.go:184] no items to output this cycle
I0319 17:51:23.409822 543705 cpu.go:275] no items to output this cycle
E0319 17:51:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:33.409775 543705 memory.go:184] no items to output this cycle
I0319 17:51:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 17:51:37.790102 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:51:37.790109 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:51:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:43.410654 543705 memory.go:191] Add success.
I0319 17:51:43.409809 543705 cpu.go:282] Add success.
I0319 17:51:43.420372 543705 net.go:648] Add success.
I0319 17:51:43.423028 543705 net.go:770] primary dev: ETH0
I0319 17:51:43.423041 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:51:43.423054 543705 net.go:698] Add success.
I0319 17:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:51:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:51:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:51:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:51:53.409815 543705 memory.go:184] no items to output this cycle
I0319 17:51:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 17:52:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:03.409797 543705 memory.go:184] no items to output this cycle
I0319 17:52:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 17:52:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:13.409810 543705 memory.go:191] Add success.
I0319 17:52:13.409827 543705 cpu.go:282] Add success.
W0319 17:52:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:52:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:52:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:52:13.420128 543705 net.go:648] Add success.
I0319 17:52:13.423094 543705 net.go:770] primary dev: ETH0
I0319 17:52:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:52:13.423124 543705 net.go:698] Add success.
W0319 17:52:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:52:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 17:52:14.455197 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:52:14.456432 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:52:14.456458 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:52:14.456465 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:52:14.456856 543705 disk_worker.go:494] system disk:vda1
I0319 17:52:14.456900 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:52:15.456781 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:52:15.456789 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:52:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:52:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:52:16.457988 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:52:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:52:16.472320 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:52:20.313681 543705 disk_info.go:125] begin check local disk info of client
I0319 17:52:20.316230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:52:20.316237 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029ef80 0xc00029efc0]
E0319 17:52:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:23.409781 543705 memory.go:184] no items to output this cycle
I0319 17:52:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:52:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:33.409813 543705 memory.go:184] no items to output this cycle
I0319 17:52:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 17:52:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:43.409776 543705 memory.go:191] Add success.
I0319 17:52:43.409807 543705 cpu.go:282] Add success.
I0319 17:52:43.419902 543705 net.go:648] Add success.
I0319 17:52:43.422855 543705 net.go:770] primary dev: ETH0
I0319 17:52:43.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:52:43.422892 543705 net.go:698] Add success.
I0319 17:52:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:52:46.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:52:46.458097 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:52:53.410246 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:52:53.410264 543705 memory.go:184] no items to output this cycle
I0319 17:52:53.410285 543705 cpu.go:275] no items to output this cycle
E0319 17:53:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:03.409799 543705 memory.go:184] no items to output this cycle
I0319 17:53:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 17:53:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:13.409786 543705 memory.go:191] Add success.
W0319 17:53:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:53:13.409812 543705 cpu.go:282] Add success.
W0319 17:53:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:53:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:53:13.420103 543705 net.go:648] Add success.
I0319 17:53:13.422941 543705 net.go:770] primary dev: ETH0
I0319 17:53:13.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:53:13.422966 543705 net.go:698] Add success.
W0319 17:53:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:53:14.455248 543705 disk_worker.go:708] disk space is not compliant
W0319 17:53:14.455253 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:53:14.458136 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:53:14.459122 543705 disk_worker.go:494] system disk:vda1
I0319 17:53:14.459165 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:53:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:53:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:53:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:53:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:53:16.472464 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:53:20.317678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:53:20.320232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:53:20.320239 543705 disk_info.go:196] parse disk info done, disk is : [0xc000291200 0xc000291240]
E0319 17:53:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:23.409795 543705 memory.go:184] no items to output this cycle
I0319 17:53:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 17:53:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:33.409801 543705 memory.go:184] no items to output this cycle
I0319 17:53:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 17:53:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:43.409815 543705 memory.go:191] Add success.
I0319 17:53:43.409823 543705 cpu.go:282] Add success.
I0319 17:53:43.419912 543705 net.go:648] Add success.
I0319 17:53:43.422571 543705 net.go:770] primary dev: ETH0
I0319 17:53:43.422586 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:53:43.422598 543705 net.go:698] Add success.
I0319 17:53:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:53:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:53:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:53:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:53:53.409800 543705 memory.go:184] no items to output this cycle
I0319 17:53:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 17:54:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:03.409774 543705 memory.go:184] no items to output this cycle
I0319 17:54:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 17:54:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:13.409806 543705 memory.go:191] Add success.
I0319 17:54:13.409814 543705 cpu.go:282] Add success.
W0319 17:54:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:54:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:54:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:54:13.420059 543705 net.go:648] Add success.
I0319 17:54:13.423156 543705 net.go:770] primary dev: ETH0
I0319 17:54:13.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:54:13.423180 543705 net.go:698] Add success.
I0319 17:54:13.464574 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5dfad9a6-f07b-45f9-be49-fb1a56ac55cc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:54:13.464610 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 17:54:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:54:14.455306 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:54:14.455320 543705 disk_worker.go:708] disk space is not compliant
W0319 17:54:14.455324 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:54:14.456950 543705 disk_worker.go:494] system disk:vda1
I0319 17:54:14.456992 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:54:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:54:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:54:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:54:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:54:20.321679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:54:20.324249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:54:20.324255 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fe1c0 0xc0003fe200]
E0319 17:54:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:23.409771 543705 memory.go:184] no items to output this cycle
I0319 17:54:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 17:54:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:33.409773 543705 memory.go:184] no items to output this cycle
I0319 17:54:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 17:54:37.790253 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:54:37.790260 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:54:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:43.410698 543705 memory.go:191] Add success.
I0319 17:54:43.409820 543705 cpu.go:282] Add success.
I0319 17:54:43.420465 543705 net.go:648] Add success.
I0319 17:54:43.423304 543705 net.go:770] primary dev: ETH0
I0319 17:54:43.423318 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:54:43.423330 543705 net.go:698] Add success.
I0319 17:54:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:54:46.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:54:46.458099 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:54:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:54:53.409785 543705 memory.go:184] no items to output this cycle
I0319 17:54:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 17:55:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:03.409778 543705 memory.go:184] no items to output this cycle
I0319 17:55:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 17:55:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:13.409791 543705 memory.go:191] Add success.
I0319 17:55:13.409797 543705 cpu.go:282] Add success.
W0319 17:55:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:55:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:55:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:55:13.420332 543705 net.go:648] Add success.
I0319 17:55:13.423247 543705 net.go:770] primary dev: ETH0
I0319 17:55:13.423260 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:55:13.423271 543705 net.go:698] Add success.
I0319 17:55:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:55:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:55:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 17:55:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:55:14.456555 543705 disk_worker.go:494] system disk:vda1
I0319 17:55:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:55:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:55:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:55:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:55:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:55:20.325678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:55:20.328223 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:55:20.328231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6280 0xc0003b6300]
E0319 17:55:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:23.409766 543705 memory.go:184] no items to output this cycle
I0319 17:55:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 17:55:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:33.409778 543705 memory.go:184] no items to output this cycle
I0319 17:55:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:55:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:43.409799 543705 memory.go:191] Add success.
I0319 17:55:43.409799 543705 cpu.go:282] Add success.
I0319 17:55:43.420013 543705 net.go:648] Add success.
I0319 17:55:43.422781 543705 net.go:770] primary dev: ETH0
I0319 17:55:43.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:55:43.422823 543705 net.go:698] Add success.
I0319 17:55:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:55:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:55:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:55:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:55:53.409792 543705 memory.go:184] no items to output this cycle
I0319 17:55:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 17:56:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:03.409776 543705 memory.go:184] no items to output this cycle
I0319 17:56:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:56:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:13.409776 543705 memory.go:191] Add success.
W0319 17:56:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:56:13.409811 543705 cpu.go:282] Add success.
W0319 17:56:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:56:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:56:13.420202 543705 net.go:648] Add success.
I0319 17:56:13.423021 543705 net.go:770] primary dev: ETH0
I0319 17:56:13.423033 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:56:13.423045 543705 net.go:698] Add success.
I0319 17:56:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:56:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:56:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 17:56:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:56:14.456584 543705 disk_worker.go:494] system disk:vda1
I0319 17:56:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:56:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:56:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:56:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:56:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:56:20.329681 543705 disk_info.go:125] begin check local disk info of client
I0319 17:56:20.332218 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:56:20.332226 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344a40 0xc000344a80]
E0319 17:56:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:23.409802 543705 memory.go:184] no items to output this cycle
I0319 17:56:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 17:56:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:33.409777 543705 memory.go:184] no items to output this cycle
I0319 17:56:33.409780 543705 cpu.go:275] no items to output this cycle
E0319 17:56:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:43.409781 543705 memory.go:191] Add success.
I0319 17:56:43.409815 543705 cpu.go:282] Add success.
I0319 17:56:43.419962 543705 net.go:648] Add success.
I0319 17:56:43.422622 543705 net.go:770] primary dev: ETH0
I0319 17:56:43.422636 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:56:43.422649 543705 net.go:698] Add success.
I0319 17:56:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:56:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:56:46.458097 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:56:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:56:53.409772 543705 memory.go:184] no items to output this cycle
I0319 17:56:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 17:57:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:03.409779 543705 memory.go:184] no items to output this cycle
I0319 17:57:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 17:57:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:13.409790 543705 memory.go:191] Add success.
I0319 17:57:13.409792 543705 cpu.go:282] Add success.
W0319 17:57:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:57:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:57:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:57:13.420122 543705 net.go:648] Add success.
I0319 17:57:13.428804 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 17:57:13.428880 543705 net.go:770] primary dev: ETH0
I0319 17:57:13.428894 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:57:13.428908 543705 net.go:698] Add success.
I0319 17:57:13.452770 543705 event_worker.go:152] Polling the log file for events...
I0319 17:57:13.780639 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"66a8badf-6602-4a10-a759-10ecc915bb9e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 17:57:13.780670 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 17:57:14.454851 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:57:14.454919 543705 disk_worker.go:708] disk space is not compliant
W0319 17:57:14.454924 543705 disk_worker.go:728] disk inode is not compliant
E0319 17:57:14.455620 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 17:57:14.455629 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 17:57:14.455634 543705 custom_config.go:64] query custom config with name: gpu
I0319 17:57:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 17:57:14.456524 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 17:57:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 17:57:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:57:16.457915 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 17:57:16.457915 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 17:57:16.457970 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:57:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:57:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:57:20.333679 543705 disk_info.go:125] begin check local disk info of client
I0319 17:57:20.336234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:57:20.336241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005204c0 0xc000520500]
E0319 17:57:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:23.409773 543705 memory.go:184] no items to output this cycle
I0319 17:57:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 17:57:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:33.409769 543705 memory.go:184] no items to output this cycle
I0319 17:57:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 17:57:37.791654 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 17:57:37.791660 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 17:57:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:43.410775 543705 memory.go:191] Add success.
I0319 17:57:43.409808 543705 cpu.go:282] Add success.
I0319 17:57:43.420489 543705 net.go:648] Add success.
I0319 17:57:43.423286 543705 net.go:770] primary dev: ETH0
I0319 17:57:43.423309 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:57:43.423321 543705 net.go:698] Add success.
I0319 17:57:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:57:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:57:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:57:53.409803 543705 memory.go:184] no items to output this cycle
I0319 17:57:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 17:58:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:03.409779 543705 memory.go:184] no items to output this cycle
I0319 17:58:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 17:58:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:13.409774 543705 memory.go:191] Add success.
W0319 17:58:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 17:58:13.409808 543705 cpu.go:282] Add success.
W0319 17:58:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:58:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:58:13.419724 543705 net.go:648] Add success.
I0319 17:58:13.422394 543705 net.go:770] primary dev: ETH0
I0319 17:58:13.422409 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:58:13.422422 543705 net.go:698] Add success.
I0319 17:58:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:58:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:58:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 17:58:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:58:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 17:58:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:58:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:58:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:58:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:58:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:58:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:58:20.337678 543705 disk_info.go:125] begin check local disk info of client
I0319 17:58:20.340218 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:58:20.340225 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a29c0 0xc0004a2a00]
E0319 17:58:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:23.409771 543705 memory.go:184] no items to output this cycle
I0319 17:58:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 17:58:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:33.409800 543705 memory.go:184] no items to output this cycle
I0319 17:58:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 17:58:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:43.409827 543705 memory.go:191] Add success.
I0319 17:58:43.409836 543705 cpu.go:282] Add success.
I0319 17:58:43.419967 543705 net.go:648] Add success.
I0319 17:58:43.422958 543705 net.go:770] primary dev: ETH0
I0319 17:58:43.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:58:43.422986 543705 net.go:698] Add success.
I0319 17:58:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:58:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:58:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:58:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:58:53.409784 543705 cpu.go:275] no items to output this cycle
I0319 17:58:53.409786 543705 memory.go:184] no items to output this cycle
E0319 17:59:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:03.409780 543705 memory.go:184] no items to output this cycle
I0319 17:59:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 17:59:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:13.409810 543705 memory.go:191] Add success.
I0319 17:59:13.409819 543705 cpu.go:282] Add success.
W0319 17:59:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 17:59:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 17:59:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 17:59:13.419726 543705 net.go:648] Add success.
I0319 17:59:13.422593 543705 net.go:770] primary dev: ETH0
I0319 17:59:13.422606 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:59:13.422617 543705 net.go:698] Add success.
I0319 17:59:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 17:59:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 17:59:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 17:59:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 17:59:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 17:59:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 17:59:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 17:59:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:59:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:59:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 17:59:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0319 17:59:20.341680 543705 disk_info.go:125] begin check local disk info of client
I0319 17:59:20.344230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 17:59:20.344237 543705 disk_info.go:196] parse disk info done, disk is : [0xc000578340 0xc000578380]
E0319 17:59:23.410227 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:23.410246 543705 memory.go:184] no items to output this cycle
I0319 17:59:23.410259 543705 cpu.go:275] no items to output this cycle
E0319 17:59:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:33.409778 543705 memory.go:184] no items to output this cycle
I0319 17:59:33.409779 543705 cpu.go:275] no items to output this cycle
E0319 17:59:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:43.409816 543705 memory.go:191] Add success.
I0319 17:59:43.409824 543705 cpu.go:282] Add success.
I0319 17:59:43.419863 543705 net.go:648] Add success.
I0319 17:59:43.422811 543705 net.go:770] primary dev: ETH0
I0319 17:59:43.422825 543705 net.go:802] Send network stats successfully!,count is 6
I0319 17:59:43.422838 543705 net.go:698] Add success.
I0319 17:59:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 17:59:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 17:59:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 17:59:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 17:59:53.409783 543705 memory.go:184] no items to output this cycle
I0319 17:59:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 18:00:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:03.409782 543705 memory.go:184] no items to output this cycle
I0319 18:00:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 18:00:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:13.409804 543705 memory.go:191] Add success.
I0319 18:00:13.409816 543705 cpu.go:282] Add success.
W0319 18:00:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:00:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:00:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:00:13.420178 543705 net.go:648] Add success.
I0319 18:00:13.422922 543705 net.go:770] primary dev: ETH0
I0319 18:00:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:00:13.422947 543705 net.go:698] Add success.
I0319 18:00:13.463677 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4e3b1341-04d8-4a2e-aeed-990eb4083210","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:00:13.463706 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:00:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:00:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:00:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0319 18:00:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:00:14.456475 543705 disk_worker.go:494] system disk:vda1
I0319 18:00:14.456517 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:00:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:00:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:00:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:00:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:00:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:00:20.345683 543705 disk_info.go:125] begin check local disk info of client
I0319 18:00:20.348287 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:00:20.348295 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005fab40 0xc0005fab80]
E0319 18:00:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:23.409770 543705 memory.go:184] no items to output this cycle
I0319 18:00:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 18:00:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:33.409774 543705 memory.go:184] no items to output this cycle
I0319 18:00:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 18:00:37.792656 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:00:37.792663 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:00:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:43.410619 543705 memory.go:191] Add success.
I0319 18:00:43.409820 543705 cpu.go:282] Add success.
I0319 18:00:43.420331 543705 net.go:648] Add success.
I0319 18:00:43.422908 543705 net.go:770] primary dev: ETH0
I0319 18:00:43.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:00:43.422937 543705 net.go:698] Add success.
I0319 18:00:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:00:46.458073 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:00:46.458100 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:00:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:00:53.409774 543705 memory.go:184] no items to output this cycle
I0319 18:00:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 18:01:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:03.409780 543705 memory.go:184] no items to output this cycle
I0319 18:01:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:01:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:13.409788 543705 memory.go:191] Add success.
W0319 18:01:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:01:13.409814 543705 cpu.go:282] Add success.
W0319 18:01:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:01:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:01:13.420319 543705 net.go:648] Add success.
I0319 18:01:13.423101 543705 net.go:770] primary dev: ETH0
I0319 18:01:13.423117 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:01:13.423129 543705 net.go:698] Add success.
I0319 18:01:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:01:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:01:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 18:01:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:01:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 18:01:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:01:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:01:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:01:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:01:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:01:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:01:20.351524 543705 disk_info.go:125] begin check local disk info of client
I0319 18:01:20.354127 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:01:20.354134 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035e780 0xc00035e7c0]
E0319 18:01:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:23.409781 543705 cpu.go:275] no items to output this cycle
I0319 18:01:23.409787 543705 memory.go:184] no items to output this cycle
E0319 18:01:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:33.409797 543705 memory.go:184] no items to output this cycle
I0319 18:01:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 18:01:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:43.409781 543705 memory.go:191] Add success.
I0319 18:01:43.409799 543705 cpu.go:282] Add success.
I0319 18:01:43.419858 543705 net.go:648] Add success.
I0319 18:01:43.422749 543705 net.go:770] primary dev: ETH0
I0319 18:01:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:01:43.422775 543705 net.go:698] Add success.
I0319 18:01:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:01:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:01:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:01:53.409804 543705 memory.go:184] no items to output this cycle
I0319 18:01:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 18:02:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:03.409775 543705 memory.go:184] no items to output this cycle
I0319 18:02:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 18:02:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:13.409787 543705 memory.go:191] Add success.
I0319 18:02:13.409790 543705 cpu.go:282] Add success.
W0319 18:02:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:02:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:02:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:02:13.420234 543705 net.go:648] Add success.
I0319 18:02:13.423524 543705 net.go:770] primary dev: ETH0
I0319 18:02:13.423537 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:02:13.423549 543705 net.go:698] Add success.
W0319 18:02:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:02:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 18:02:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:02:14.456832 543705 disk_worker.go:494] system disk:vda1
I0319 18:02:14.456870 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:02:14.457679 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:02:14.457688 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:02:14.457694 543705 custom_config.go:64] query custom config with name: gpu
E0319 18:02:15.456790 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:02:15.456798 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:02:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:02:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:02:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:02:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:02:16.472325 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:02:20.357683 543705 disk_info.go:125] begin check local disk info of client
I0319 18:02:20.360301 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:02:20.360309 543705 disk_info.go:196] parse disk info done, disk is : [0xc00051a640 0xc00051a680]
E0319 18:02:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:23.409778 543705 cpu.go:275] no items to output this cycle
I0319 18:02:23.409787 543705 memory.go:184] no items to output this cycle
E0319 18:02:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:33.409801 543705 memory.go:184] no items to output this cycle
I0319 18:02:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 18:02:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:43.409791 543705 memory.go:191] Add success.
I0319 18:02:43.409820 543705 cpu.go:282] Add success.
I0319 18:02:43.419946 543705 net.go:648] Add success.
I0319 18:02:43.422906 543705 net.go:770] primary dev: ETH0
I0319 18:02:43.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:02:43.422932 543705 net.go:698] Add success.
I0319 18:02:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:02:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:02:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:02:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:02:53.409803 543705 memory.go:184] no items to output this cycle
I0319 18:02:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:03:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:03.409780 543705 memory.go:184] no items to output this cycle
I0319 18:03:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:03:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:13.409809 543705 memory.go:191] Add success.
I0319 18:03:13.409821 543705 cpu.go:282] Add success.
W0319 18:03:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:03:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:03:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:03:13.420113 543705 net.go:648] Add success.
I0319 18:03:13.422934 543705 net.go:770] primary dev: ETH0
I0319 18:03:13.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:03:13.422966 543705 net.go:698] Add success.
I0319 18:03:14.276746 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"383119da-1412-472f-8bbb-4489ca117023","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:03:14.276781 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:03:14.454706 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:03:14.454862 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:03:14.454872 543705 disk_worker.go:708] disk space is not compliant
W0319 18:03:14.454875 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:03:14.456207 543705 disk_worker.go:494] system disk:vda1
I0319 18:03:14.456260 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:03:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:03:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:03:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:03:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:03:16.472465 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:03:20.361682 543705 disk_info.go:125] begin check local disk info of client
I0319 18:03:20.364239 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:03:20.364246 543705 disk_info.go:196] parse disk info done, disk is : [0xc000460740 0xc000460780]
E0319 18:03:23.410257 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:23.410272 543705 memory.go:184] no items to output this cycle
I0319 18:03:23.410280 543705 cpu.go:275] no items to output this cycle
E0319 18:03:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:33.409771 543705 memory.go:184] no items to output this cycle
I0319 18:03:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 18:03:37.793732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:03:37.793738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:03:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:43.410676 543705 memory.go:191] Add success.
I0319 18:03:43.409820 543705 cpu.go:282] Add success.
I0319 18:03:43.420380 543705 net.go:648] Add success.
I0319 18:03:43.423281 543705 net.go:770] primary dev: ETH0
I0319 18:03:43.423294 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:03:43.423307 543705 net.go:698] Add success.
I0319 18:03:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:03:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:03:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:03:53.410189 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:03:53.410207 543705 memory.go:184] no items to output this cycle
I0319 18:03:53.410231 543705 cpu.go:275] no items to output this cycle
E0319 18:04:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:03.409793 543705 memory.go:184] no items to output this cycle
I0319 18:04:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 18:04:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:13.409785 543705 memory.go:191] Add success.
I0319 18:04:13.409806 543705 cpu.go:282] Add success.
W0319 18:04:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:04:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:04:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:04:13.420278 543705 net.go:648] Add success.
I0319 18:04:13.423121 543705 net.go:770] primary dev: ETH0
I0319 18:04:13.423135 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:04:13.423149 543705 net.go:698] Add success.
I0319 18:04:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:04:14.455257 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:04:14.455329 543705 disk_worker.go:708] disk space is not compliant
W0319 18:04:14.455332 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:04:14.456779 543705 disk_worker.go:494] system disk:vda1
I0319 18:04:14.456826 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:04:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:04:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:04:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:04:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:04:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:04:20.365685 543705 disk_info.go:125] begin check local disk info of client
I0319 18:04:20.368222 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:04:20.368229 543705 disk_info.go:196] parse disk info done, disk is : [0xc000516480 0xc0005164c0]
E0319 18:04:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:23.409799 543705 memory.go:184] no items to output this cycle
I0319 18:04:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 18:04:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:33.409778 543705 memory.go:184] no items to output this cycle
I0319 18:04:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:04:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:43.409789 543705 memory.go:191] Add success.
I0319 18:04:43.409804 543705 cpu.go:282] Add success.
I0319 18:04:43.419893 543705 net.go:648] Add success.
I0319 18:04:43.422698 543705 net.go:770] primary dev: ETH0
I0319 18:04:43.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:04:43.422724 543705 net.go:698] Add success.
I0319 18:04:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:04:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:04:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:04:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:04:53.409805 543705 memory.go:184] no items to output this cycle
I0319 18:04:53.409820 543705 cpu.go:275] no items to output this cycle
E0319 18:05:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:03.409785 543705 cpu.go:275] no items to output this cycle
I0319 18:05:03.409789 543705 memory.go:184] no items to output this cycle
E0319 18:05:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:13.409811 543705 memory.go:191] Add success.
I0319 18:05:13.409818 543705 cpu.go:282] Add success.
W0319 18:05:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:05:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:05:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:05:13.420054 543705 net.go:648] Add success.
I0319 18:05:13.422979 543705 net.go:770] primary dev: ETH0
I0319 18:05:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:05:13.423009 543705 net.go:698] Add success.
I0319 18:05:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:05:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:05:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 18:05:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:05:14.456490 543705 disk_worker.go:494] system disk:vda1
I0319 18:05:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:05:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:05:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:05:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:05:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:05:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:05:20.369682 543705 disk_info.go:125] begin check local disk info of client
I0319 18:05:20.372234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:05:20.372241 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032a100 0xc00032a140]
E0319 18:05:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:23.409770 543705 memory.go:184] no items to output this cycle
I0319 18:05:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 18:05:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:33.409808 543705 memory.go:184] no items to output this cycle
I0319 18:05:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 18:05:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:43.409780 543705 memory.go:191] Add success.
I0319 18:05:43.409798 543705 cpu.go:282] Add success.
I0319 18:05:43.419881 543705 net.go:648] Add success.
I0319 18:05:43.422717 543705 net.go:770] primary dev: ETH0
I0319 18:05:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:05:43.422743 543705 net.go:698] Add success.
I0319 18:05:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:05:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:05:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:05:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:05:53.409780 543705 memory.go:184] no items to output this cycle
I0319 18:05:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:06:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:03.409770 543705 memory.go:184] no items to output this cycle
I0319 18:06:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 18:06:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:13.409787 543705 memory.go:191] Add success.
I0319 18:06:13.409805 543705 cpu.go:282] Add success.
W0319 18:06:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:06:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:06:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:06:13.420055 543705 net.go:648] Add success.
I0319 18:06:13.423927 543705 net.go:770] primary dev: ETH0
I0319 18:06:13.423939 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:06:13.423951 543705 net.go:698] Add success.
I0319 18:06:13.466503 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5688c392-168a-4280-8807-d47b4962eb00","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:06:13.466540 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:06:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:06:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:06:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 18:06:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:06:14.456847 543705 disk_worker.go:494] system disk:vda1
I0319 18:06:14.456876 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:06:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:06:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:06:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:06:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:06:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:06:20.373693 543705 disk_info.go:125] begin check local disk info of client
I0319 18:06:20.376240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:06:20.376247 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464e80 0xc000464ec0]
E0319 18:06:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:23.409803 543705 memory.go:184] no items to output this cycle
I0319 18:06:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:06:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:33.409786 543705 memory.go:184] no items to output this cycle
I0319 18:06:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 18:06:37.795668 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:06:37.795674 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:06:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:43.410744 543705 memory.go:191] Add success.
I0319 18:06:43.409796 543705 cpu.go:282] Add success.
I0319 18:06:43.420535 543705 net.go:648] Add success.
I0319 18:06:43.423300 543705 net.go:770] primary dev: ETH0
I0319 18:06:43.423313 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:06:43.423325 543705 net.go:698] Add success.
I0319 18:06:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:06:46.458073 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:06:46.458101 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:06:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:06:53.409784 543705 memory.go:184] no items to output this cycle
I0319 18:06:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:07:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:03.409819 543705 memory.go:184] no items to output this cycle
I0319 18:07:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 18:07:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:13.409788 543705 memory.go:191] Add success.
W0319 18:07:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:07:13.409819 543705 cpu.go:282] Add success.
W0319 18:07:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:07:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:07:13.420428 543705 net.go:648] Add success.
I0319 18:07:13.423198 543705 net.go:770] primary dev: ETH0
I0319 18:07:13.423211 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:07:13.423224 543705 net.go:698] Add success.
I0319 18:07:13.452883 543705 event_worker.go:152] Polling the log file for events...
W0319 18:07:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:07:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 18:07:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:07:14.456155 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:07:14.456165 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:07:14.456171 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:07:14.456427 543705 disk_worker.go:494] system disk:vda1
I0319 18:07:14.456459 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:07:15.456875 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:07:15.456883 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:07:16.457959 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:07:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:07:16.458012 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:07:16.458031 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:07:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:07:20.377685 543705 disk_info.go:125] begin check local disk info of client
I0319 18:07:20.380205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:07:20.380211 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be200 0xc0002be240]
E0319 18:07:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:23.409805 543705 memory.go:184] no items to output this cycle
I0319 18:07:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:07:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:33.409818 543705 memory.go:184] no items to output this cycle
I0319 18:07:33.409834 543705 cpu.go:275] no items to output this cycle
E0319 18:07:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:43.409798 543705 memory.go:191] Add success.
I0319 18:07:43.409831 543705 cpu.go:282] Add success.
I0319 18:07:43.419897 543705 net.go:648] Add success.
I0319 18:07:43.422795 543705 net.go:770] primary dev: ETH0
I0319 18:07:43.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:07:43.422824 543705 net.go:698] Add success.
I0319 18:07:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:07:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:07:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:07:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:07:53.409815 543705 memory.go:184] no items to output this cycle
I0319 18:07:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 18:08:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:03.409792 543705 memory.go:184] no items to output this cycle
I0319 18:08:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:08:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:13.409796 543705 memory.go:191] Add success.
I0319 18:08:13.409806 543705 cpu.go:282] Add success.
W0319 18:08:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:08:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:08:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:08:13.420161 543705 net.go:648] Add success.
I0319 18:08:13.423019 543705 net.go:770] primary dev: ETH0
I0319 18:08:13.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:08:13.423043 543705 net.go:698] Add success.
I0319 18:08:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:08:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:08:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 18:08:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:08:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 18:08:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:08:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:08:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:08:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:08:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:08:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:08:20.381689 543705 disk_info.go:125] begin check local disk info of client
I0319 18:08:20.384244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:08:20.384251 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a4000 0xc0002a4400]
E0319 18:08:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:23.409810 543705 memory.go:184] no items to output this cycle
I0319 18:08:23.409823 543705 cpu.go:275] no items to output this cycle
E0319 18:08:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:33.409801 543705 memory.go:184] no items to output this cycle
I0319 18:08:33.409828 543705 cpu.go:275] no items to output this cycle
E0319 18:08:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:43.409786 543705 memory.go:191] Add success.
I0319 18:08:43.409810 543705 cpu.go:282] Add success.
I0319 18:08:43.419913 543705 net.go:648] Add success.
I0319 18:08:43.422707 543705 net.go:770] primary dev: ETH0
I0319 18:08:43.422722 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:08:43.422736 543705 net.go:698] Add success.
I0319 18:08:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:08:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:08:46.458098 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:08:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:08:53.409804 543705 memory.go:184] no items to output this cycle
I0319 18:08:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:09:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:03.409780 543705 memory.go:184] no items to output this cycle
I0319 18:09:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 18:09:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:13.409788 543705 memory.go:191] Add success.
I0319 18:09:13.409797 543705 cpu.go:282] Add success.
W0319 18:09:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:09:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:09:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:09:13.420122 543705 net.go:648] Add success.
I0319 18:09:13.423125 543705 net.go:770] primary dev: ETH0
I0319 18:09:13.423140 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:09:13.423151 543705 net.go:698] Add success.
I0319 18:09:13.715845 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7c364380-0560-4dd6-ae0d-ad9122267b45","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:09:13.715890 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:09:14.454606 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:09:14.454806 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:09:14.454816 543705 disk_worker.go:708] disk space is not compliant
W0319 18:09:14.454819 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:09:14.456181 543705 disk_worker.go:494] system disk:vda1
I0319 18:09:14.456236 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:09:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:09:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:09:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:09:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:09:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:09:20.385676 543705 disk_info.go:125] begin check local disk info of client
I0319 18:09:20.388220 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:09:20.388226 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a1c40 0xc0002a1c80]
E0319 18:09:23.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:23.409891 543705 memory.go:184] no items to output this cycle
I0319 18:09:23.409973 543705 cpu.go:275] no items to output this cycle
E0319 18:09:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:33.409802 543705 memory.go:184] no items to output this cycle
I0319 18:09:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 18:09:37.795809 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:09:37.795816 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:09:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:43.410607 543705 memory.go:191] Add success.
I0319 18:09:43.409815 543705 cpu.go:282] Add success.
I0319 18:09:43.420382 543705 net.go:648] Add success.
I0319 18:09:43.423043 543705 net.go:770] primary dev: ETH0
I0319 18:09:43.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:09:43.423072 543705 net.go:698] Add success.
I0319 18:09:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:09:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:09:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:09:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:09:53.409782 543705 memory.go:184] no items to output this cycle
I0319 18:09:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:10:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:03.409764 543705 memory.go:184] no items to output this cycle
I0319 18:10:03.409803 543705 cpu.go:275] no items to output this cycle
E0319 18:10:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:13.409795 543705 memory.go:191] Add success.
I0319 18:10:13.409795 543705 cpu.go:282] Add success.
W0319 18:10:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:10:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:10:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:10:13.420146 543705 net.go:648] Add success.
I0319 18:10:13.422973 543705 net.go:770] primary dev: ETH0
I0319 18:10:13.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:10:13.422999 543705 net.go:698] Add success.
I0319 18:10:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:10:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:10:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 18:10:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:10:14.456555 543705 disk_worker.go:494] system disk:vda1
I0319 18:10:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:10:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:10:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:10:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:10:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:10:20.389680 543705 disk_info.go:125] begin check local disk info of client
I0319 18:10:20.392241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:10:20.392249 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039fd40 0xc00039fd80]
E0319 18:10:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:23.409798 543705 memory.go:184] no items to output this cycle
I0319 18:10:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 18:10:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:33.409790 543705 memory.go:184] no items to output this cycle
I0319 18:10:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 18:10:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:43.409776 543705 memory.go:191] Add success.
I0319 18:10:43.409817 543705 cpu.go:282] Add success.
I0319 18:10:43.420004 543705 net.go:648] Add success.
I0319 18:10:43.423082 543705 net.go:770] primary dev: ETH0
I0319 18:10:43.423096 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:10:43.423108 543705 net.go:698] Add success.
I0319 18:10:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:10:46.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:10:46.458098 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:10:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:10:53.409769 543705 memory.go:184] no items to output this cycle
I0319 18:10:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 18:11:03.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:03.409760 543705 memory.go:184] no items to output this cycle
I0319 18:11:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 18:11:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:13.409790 543705 memory.go:191] Add success.
I0319 18:11:13.409795 543705 cpu.go:282] Add success.
W0319 18:11:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:11:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:11:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:11:13.420160 543705 net.go:648] Add success.
I0319 18:11:13.423322 543705 net.go:770] primary dev: ETH0
I0319 18:11:13.423340 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:11:13.423354 543705 net.go:698] Add success.
I0319 18:11:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:11:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:11:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 18:11:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:11:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 18:11:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:11:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:11:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:11:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:11:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:11:16.472439 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:11:20.393681 543705 disk_info.go:125] begin check local disk info of client
I0319 18:11:20.396458 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:11:20.396466 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394340 0xc000394380]
E0319 18:11:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:23.409798 543705 memory.go:184] no items to output this cycle
I0319 18:11:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 18:11:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:33.409791 543705 memory.go:184] no items to output this cycle
I0319 18:11:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 18:11:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:43.409797 543705 memory.go:191] Add success.
I0319 18:11:43.409797 543705 cpu.go:282] Add success.
I0319 18:11:43.419975 543705 net.go:648] Add success.
I0319 18:11:43.422843 543705 net.go:770] primary dev: ETH0
I0319 18:11:43.422856 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:11:43.422870 543705 net.go:698] Add success.
I0319 18:11:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:11:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:11:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:11:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:11:53.409775 543705 memory.go:184] no items to output this cycle
I0319 18:11:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 18:12:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:03.409777 543705 memory.go:184] no items to output this cycle
I0319 18:12:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:12:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:13.409778 543705 memory.go:191] Add success.
W0319 18:12:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:12:13.409810 543705 cpu.go:282] Add success.
W0319 18:12:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:12:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:12:13.420047 543705 net.go:648] Add success.
I0319 18:12:13.422873 543705 net.go:770] primary dev: ETH0
I0319 18:12:13.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:12:13.422899 543705 net.go:698] Add success.
I0319 18:12:13.463982 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b66597e4-e885-42a5-af03-b72889c1e6e0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:12:13.464013 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 18:12:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:12:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 18:12:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:12:14.456868 543705 disk_worker.go:494] system disk:vda1
E0319 18:12:14.456874 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:12:14.456882 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:12:14.456887 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:12:14.456909 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:12:15.456510 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:12:15.456518 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:12:16.457902 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:12:16.457903 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:12:16.457956 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:12:16.457976 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:12:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:12:20.397683 543705 disk_info.go:125] begin check local disk info of client
I0319 18:12:20.400321 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:12:20.400329 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462ac0 0xc000462b00]
E0319 18:12:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:23.409780 543705 memory.go:184] no items to output this cycle
I0319 18:12:23.409784 543705 cpu.go:275] no items to output this cycle
I0319 18:12:33.409886 543705 cpu.go:275] no items to output this cycle
E0319 18:12:33.409931 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:33.409951 543705 memory.go:184] no items to output this cycle
I0319 18:12:37.797682 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:12:37.797688 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:12:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:43.410770 543705 memory.go:191] Add success.
I0319 18:12:43.409798 543705 cpu.go:282] Add success.
I0319 18:12:43.420454 543705 net.go:648] Add success.
I0319 18:12:43.423404 543705 net.go:770] primary dev: ETH0
I0319 18:12:43.423419 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:12:43.423434 543705 net.go:698] Add success.
I0319 18:12:46.458000 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:12:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:12:46.458093 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:12:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:12:53.409787 543705 memory.go:184] no items to output this cycle
I0319 18:12:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 18:13:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:03.409770 543705 memory.go:184] no items to output this cycle
I0319 18:13:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 18:13:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:13.409777 543705 memory.go:191] Add success.
W0319 18:13:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:13:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:13:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:13:13.409827 543705 cpu.go:282] Add success.
I0319 18:13:13.420162 543705 net.go:648] Add success.
I0319 18:13:13.422814 543705 net.go:770] primary dev: ETH0
I0319 18:13:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:13:13.422843 543705 net.go:698] Add success.
I0319 18:13:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:13:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:13:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 18:13:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:13:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 18:13:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:13:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:13:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:13:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:13:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:13:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:13:20.401686 543705 disk_info.go:125] begin check local disk info of client
I0319 18:13:20.404272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:13:20.404280 543705 disk_info.go:196] parse disk info done, disk is : [0xc000579580 0xc0005795c0]
E0319 18:13:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:23.409811 543705 memory.go:184] no items to output this cycle
I0319 18:13:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:13:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:33.409793 543705 memory.go:184] no items to output this cycle
I0319 18:13:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 18:13:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:43.409823 543705 memory.go:191] Add success.
I0319 18:13:43.409825 543705 cpu.go:282] Add success.
I0319 18:13:43.419954 543705 net.go:648] Add success.
I0319 18:13:43.423105 543705 net.go:770] primary dev: ETH0
I0319 18:13:43.423118 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:13:43.423130 543705 net.go:698] Add success.
I0319 18:13:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:13:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:13:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:13:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:13:53.409806 543705 memory.go:184] no items to output this cycle
I0319 18:13:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:14:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:03.409774 543705 memory.go:184] no items to output this cycle
I0319 18:14:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:14:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:13.409788 543705 memory.go:191] Add success.
I0319 18:14:13.409797 543705 cpu.go:282] Add success.
W0319 18:14:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:14:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:14:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:14:13.420161 543705 net.go:648] Add success.
I0319 18:14:13.423205 543705 net.go:770] primary dev: ETH0
I0319 18:14:13.423228 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:14:13.423241 543705 net.go:698] Add success.
I0319 18:14:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:14:14.455090 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:14:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0319 18:14:14.455155 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:14:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 18:14:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:14:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:14:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:14:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:14:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:14:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:14:20.405697 543705 disk_info.go:125] begin check local disk info of client
I0319 18:14:20.408194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:14:20.408202 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8480 0xc0003c84c0]
I0319 18:14:23.409895 543705 cpu.go:275] no items to output this cycle
E0319 18:14:23.409976 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:23.409992 543705 memory.go:184] no items to output this cycle
E0319 18:14:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:33.409791 543705 memory.go:184] no items to output this cycle
I0319 18:14:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 18:14:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:43.409796 543705 memory.go:191] Add success.
I0319 18:14:43.409803 543705 cpu.go:282] Add success.
I0319 18:14:43.419961 543705 net.go:648] Add success.
I0319 18:14:43.422762 543705 net.go:770] primary dev: ETH0
I0319 18:14:43.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:14:43.422788 543705 net.go:698] Add success.
I0319 18:14:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:14:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:14:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:14:53.410257 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:14:53.410282 543705 memory.go:184] no items to output this cycle
I0319 18:14:53.410286 543705 cpu.go:275] no items to output this cycle
E0319 18:15:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:03.409777 543705 memory.go:184] no items to output this cycle
I0319 18:15:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 18:15:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:13.409779 543705 memory.go:191] Add success.
W0319 18:15:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:15:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:15:13.409817 543705 cpu.go:282] Add success.
I0319 18:15:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:15:13.420049 543705 net.go:648] Add success.
I0319 18:15:13.423277 543705 net.go:770] primary dev: ETH0
I0319 18:15:13.423290 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:15:13.423302 543705 net.go:698] Add success.
I0319 18:15:13.468334 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c865989b-088d-4c99-915d-6d64718139ff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:15:13.468378 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:15:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:15:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:15:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 18:15:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:15:14.456739 543705 disk_worker.go:494] system disk:vda1
I0319 18:15:14.456766 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:15:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:15:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:15:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:15:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:15:20.409700 543705 disk_info.go:125] begin check local disk info of client
I0319 18:15:20.412143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:15:20.412152 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a2100 0xc0002a2140]
E0319 18:15:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:23.409782 543705 memory.go:184] no items to output this cycle
I0319 18:15:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 18:15:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:33.409781 543705 memory.go:184] no items to output this cycle
I0319 18:15:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 18:15:37.799684 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:15:37.799690 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:15:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:43.410680 543705 memory.go:191] Add success.
I0319 18:15:43.409808 543705 cpu.go:282] Add success.
I0319 18:15:43.420392 543705 net.go:648] Add success.
I0319 18:15:43.423271 543705 net.go:770] primary dev: ETH0
I0319 18:15:43.423285 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:15:43.423297 543705 net.go:698] Add success.
I0319 18:15:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:15:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:15:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:15:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:15:53.409783 543705 memory.go:184] no items to output this cycle
I0319 18:15:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 18:16:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:03.409800 543705 memory.go:184] no items to output this cycle
I0319 18:16:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:16:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:13.409816 543705 memory.go:191] Add success.
I0319 18:16:13.409824 543705 cpu.go:282] Add success.
W0319 18:16:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:16:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:16:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:16:13.420137 543705 net.go:648] Add success.
I0319 18:16:13.423182 543705 net.go:770] primary dev: ETH0
I0319 18:16:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:16:13.423209 543705 net.go:698] Add success.
I0319 18:16:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:16:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:16:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 18:16:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:16:14.456542 543705 disk_worker.go:494] system disk:vda1
I0319 18:16:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:16:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:16:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:16:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:16:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:16:16.472489 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:16:20.412824 543705 disk_info.go:125] begin check local disk info of client
I0319 18:16:20.416301 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:16:20.416311 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004dc380 0xc0004dc3c0]
E0319 18:16:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:23.409784 543705 cpu.go:275] no items to output this cycle
I0319 18:16:23.409795 543705 memory.go:184] no items to output this cycle
E0319 18:16:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:33.409774 543705 memory.go:184] no items to output this cycle
I0319 18:16:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 18:16:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:43.409816 543705 memory.go:191] Add success.
I0319 18:16:43.409824 543705 cpu.go:282] Add success.
I0319 18:16:43.419893 543705 net.go:648] Add success.
I0319 18:16:43.422522 543705 net.go:770] primary dev: ETH0
I0319 18:16:43.422535 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:16:43.422548 543705 net.go:698] Add success.
I0319 18:16:46.458024 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:16:46.458102 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:16:46.458139 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:16:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:16:53.409806 543705 memory.go:184] no items to output this cycle
I0319 18:16:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:17:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:03.409780 543705 cpu.go:275] no items to output this cycle
I0319 18:17:03.409784 543705 memory.go:184] no items to output this cycle
E0319 18:17:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:13.409825 543705 memory.go:191] Add success.
I0319 18:17:13.409827 543705 cpu.go:282] Add success.
W0319 18:17:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:17:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:17:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:17:13.420127 543705 net.go:648] Add success.
I0319 18:17:13.422788 543705 net.go:770] primary dev: ETH0
I0319 18:17:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:17:13.422815 543705 net.go:698] Add success.
I0319 18:17:13.453352 543705 event_worker.go:152] Polling the log file for events...
W0319 18:17:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:17:14.455150 543705 disk_worker.go:708] disk space is not compliant
W0319 18:17:14.455153 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:17:14.456884 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:17:14.456893 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:17:14.456899 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:17:14.456971 543705 disk_worker.go:494] system disk:vda1
I0319 18:17:14.457014 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:17:15.456817 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:17:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:17:16.457957 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:17:16.457957 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:17:16.458013 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:17:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:17:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:17:20.416803 543705 disk_info.go:125] begin check local disk info of client
I0319 18:17:20.419344 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:17:20.419351 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e100 0xc00039e180]
E0319 18:17:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:23.409807 543705 memory.go:184] no items to output this cycle
I0319 18:17:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:17:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 18:17:33.409785 543705 memory.go:184] no items to output this cycle
E0319 18:17:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:43.409785 543705 memory.go:191] Add success.
I0319 18:17:43.409789 543705 cpu.go:282] Add success.
I0319 18:17:43.419867 543705 net.go:648] Add success.
I0319 18:17:43.422745 543705 net.go:770] primary dev: ETH0
I0319 18:17:43.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:17:43.422772 543705 net.go:698] Add success.
I0319 18:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:17:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:17:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:17:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:17:53.409785 543705 memory.go:184] no items to output this cycle
I0319 18:17:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 18:18:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:03.409778 543705 memory.go:184] no items to output this cycle
I0319 18:18:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 18:18:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:13.409781 543705 memory.go:191] Add success.
I0319 18:18:13.409804 543705 cpu.go:282] Add success.
W0319 18:18:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:18:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:18:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:18:13.420137 543705 net.go:648] Add success.
I0319 18:18:13.423038 543705 net.go:770] primary dev: ETH0
I0319 18:18:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:18:13.423064 543705 net.go:698] Add success.
I0319 18:18:13.468928 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"131617d5-1a29-45f2-a3ff-8419dfc132ff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:18:13.468960 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:18:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:18:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:18:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0319 18:18:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:18:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 18:18:14.456531 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:18:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:18:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:18:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:18:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:18:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:18:20.419816 543705 disk_info.go:125] begin check local disk info of client
I0319 18:18:20.422272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:18:20.422280 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039fcc0 0xc00039fd00]
E0319 18:18:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:23.409791 543705 memory.go:184] no items to output this cycle
I0319 18:18:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 18:18:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:33.409768 543705 memory.go:184] no items to output this cycle
I0319 18:18:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 18:18:37.801716 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:18:37.801724 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:18:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:43.410753 543705 memory.go:191] Add success.
I0319 18:18:43.409801 543705 cpu.go:282] Add success.
I0319 18:18:43.420458 543705 net.go:648] Add success.
I0319 18:18:43.423317 543705 net.go:770] primary dev: ETH0
I0319 18:18:43.423331 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:18:43.423343 543705 net.go:698] Add success.
I0319 18:18:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:18:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:18:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:18:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:18:53.409785 543705 memory.go:184] no items to output this cycle
I0319 18:18:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:19:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:03.409795 543705 memory.go:184] no items to output this cycle
I0319 18:19:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 18:19:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:13.409792 543705 memory.go:191] Add success.
I0319 18:19:13.409794 543705 cpu.go:282] Add success.
W0319 18:19:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:19:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:19:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:19:13.420138 543705 net.go:648] Add success.
I0319 18:19:13.422559 543705 net.go:770] primary dev: ETH0
I0319 18:19:13.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:19:13.422586 543705 net.go:698] Add success.
I0319 18:19:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:19:14.455346 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:19:14.455427 543705 disk_worker.go:708] disk space is not compliant
W0319 18:19:14.455431 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:19:14.457046 543705 disk_worker.go:494] system disk:vda1
I0319 18:19:14.457074 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:19:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:19:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:19:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:19:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:19:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:19:20.422815 543705 disk_info.go:125] begin check local disk info of client
I0319 18:19:20.425312 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:19:20.425322 543705 disk_info.go:196] parse disk info done, disk is : [0xc000342100 0xc000342140]
E0319 18:19:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:23.409792 543705 memory.go:184] no items to output this cycle
I0319 18:19:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 18:19:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:33.409763 543705 memory.go:184] no items to output this cycle
I0319 18:19:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 18:19:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:43.409780 543705 memory.go:191] Add success.
I0319 18:19:43.409799 543705 cpu.go:282] Add success.
I0319 18:19:43.419687 543705 net.go:770] primary dev: ETH0
I0319 18:19:43.419700 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:19:43.419714 543705 net.go:698] Add success.
I0319 18:19:43.419945 543705 net.go:648] Add success.
I0319 18:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:19:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:19:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:19:53.410327 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:19:53.410346 543705 memory.go:184] no items to output this cycle
I0319 18:19:53.410369 543705 cpu.go:275] no items to output this cycle
E0319 18:20:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:03.409798 543705 memory.go:184] no items to output this cycle
I0319 18:20:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 18:20:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:13.409776 543705 memory.go:191] Add success.
W0319 18:20:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:20:13.409807 543705 cpu.go:282] Add success.
W0319 18:20:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:20:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:20:13.420158 543705 net.go:648] Add success.
I0319 18:20:13.423049 543705 net.go:770] primary dev: ETH0
I0319 18:20:13.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:20:13.423090 543705 net.go:698] Add success.
I0319 18:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:20:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:20:14.455301 543705 disk_worker.go:708] disk space is not compliant
W0319 18:20:14.455307 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:20:14.456960 543705 disk_worker.go:494] system disk:vda1
I0319 18:20:14.456988 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:20:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:20:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:20:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:20:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:20:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:20:20.425826 543705 disk_info.go:125] begin check local disk info of client
I0319 18:20:20.428280 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:20:20.428286 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c3d40 0xc0004c3d80]
E0319 18:20:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:23.409809 543705 memory.go:184] no items to output this cycle
I0319 18:20:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:20:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:33.409777 543705 memory.go:184] no items to output this cycle
I0319 18:20:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 18:20:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:43.409811 543705 memory.go:191] Add success.
I0319 18:20:43.409814 543705 cpu.go:282] Add success.
I0319 18:20:43.420033 543705 net.go:648] Add success.
I0319 18:20:43.422961 543705 net.go:770] primary dev: ETH0
I0319 18:20:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:20:43.422993 543705 net.go:698] Add success.
I0319 18:20:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:20:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:20:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:20:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:20:53.409783 543705 memory.go:184] no items to output this cycle
I0319 18:20:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 18:21:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:03.409767 543705 memory.go:184] no items to output this cycle
I0319 18:21:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 18:21:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:13.409782 543705 memory.go:191] Add success.
I0319 18:21:13.409802 543705 cpu.go:282] Add success.
W0319 18:21:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:21:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:21:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:21:13.420036 543705 net.go:648] Add success.
I0319 18:21:13.422790 543705 net.go:770] primary dev: ETH0
I0319 18:21:13.422805 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:21:13.422817 543705 net.go:698] Add success.
I0319 18:21:13.468880 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"44d7d0c3-f62f-4ac6-be99-2b731370d589","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:21:13.468911 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:21:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:21:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:21:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 18:21:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:21:14.456537 543705 disk_worker.go:494] system disk:vda1
I0319 18:21:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:21:15.455793 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:21:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:21:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:21:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:21:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:21:20.428826 543705 disk_info.go:125] begin check local disk info of client
I0319 18:21:20.431436 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:21:20.431444 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c38c0 0xc0004c3900]
E0319 18:21:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:23.409787 543705 memory.go:184] no items to output this cycle
I0319 18:21:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 18:21:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:33.409797 543705 memory.go:184] no items to output this cycle
I0319 18:21:33.409808 543705 cpu.go:275] no items to output this cycle
I0319 18:21:37.803711 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:21:37.803718 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:21:43.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:43.410645 543705 memory.go:191] Add success.
I0319 18:21:43.409806 543705 cpu.go:282] Add success.
I0319 18:21:43.420360 543705 net.go:648] Add success.
I0319 18:21:43.423135 543705 net.go:770] primary dev: ETH0
I0319 18:21:43.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:21:43.423169 543705 net.go:698] Add success.
I0319 18:21:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:21:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:21:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:21:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:21:53.409768 543705 memory.go:184] no items to output this cycle
I0319 18:21:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:22:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:03.409761 543705 memory.go:184] no items to output this cycle
I0319 18:22:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 18:22:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:13.409787 543705 memory.go:191] Add success.
I0319 18:22:13.409791 543705 cpu.go:282] Add success.
W0319 18:22:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:22:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:22:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:22:13.420143 543705 net.go:648] Add success.
I0319 18:22:13.423020 543705 net.go:770] primary dev: ETH0
I0319 18:22:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:22:13.423053 543705 net.go:698] Add success.
W0319 18:22:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:22:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 18:22:14.455163 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:22:14.456969 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:22:14.456978 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:22:14.456984 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:22:14.457023 543705 disk_worker.go:494] system disk:vda1
I0319 18:22:14.457052 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:22:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:22:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:22:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:22:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:22:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:22:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:22:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:22:20.431852 543705 disk_info.go:125] begin check local disk info of client
I0319 18:22:20.434380 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:22:20.434386 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a3d00 0xc0002a3d40]
E0319 18:22:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:23.409811 543705 memory.go:184] no items to output this cycle
I0319 18:22:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 18:22:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:33.409800 543705 memory.go:184] no items to output this cycle
I0319 18:22:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:22:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:43.409783 543705 memory.go:191] Add success.
I0319 18:22:43.409802 543705 cpu.go:282] Add success.
I0319 18:22:43.419991 543705 net.go:648] Add success.
I0319 18:22:43.422750 543705 net.go:770] primary dev: ETH0
I0319 18:22:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:22:43.422779 543705 net.go:698] Add success.
I0319 18:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:22:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:22:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:22:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:22:53.409792 543705 memory.go:184] no items to output this cycle
I0319 18:22:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 18:23:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:03.409776 543705 memory.go:184] no items to output this cycle
I0319 18:23:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 18:23:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:13.409793 543705 memory.go:191] Add success.
I0319 18:23:13.409796 543705 cpu.go:282] Add success.
W0319 18:23:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:23:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:23:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:23:13.420328 543705 net.go:648] Add success.
I0319 18:23:13.423035 543705 net.go:770] primary dev: ETH0
I0319 18:23:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:23:13.423065 543705 net.go:698] Add success.
I0319 18:23:14.454948 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:23:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:23:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 18:23:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:23:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 18:23:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:23:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:23:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:23:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:23:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:23:16.472517 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:23:20.434873 543705 disk_info.go:125] begin check local disk info of client
I0319 18:23:20.437387 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:23:20.437393 543705 disk_info.go:196] parse disk info done, disk is : [0xc00051d240 0xc00051d280]
E0319 18:23:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:23.409810 543705 memory.go:184] no items to output this cycle
I0319 18:23:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 18:23:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:33.409779 543705 memory.go:184] no items to output this cycle
I0319 18:23:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 18:23:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:43.409790 543705 memory.go:191] Add success.
I0319 18:23:43.409791 543705 cpu.go:282] Add success.
I0319 18:23:43.419867 543705 net.go:648] Add success.
I0319 18:23:43.422745 543705 net.go:770] primary dev: ETH0
I0319 18:23:43.422758 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:23:43.422771 543705 net.go:698] Add success.
I0319 18:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:23:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:23:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:23:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:23:53.409804 543705 memory.go:184] no items to output this cycle
I0319 18:23:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:24:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:03.409770 543705 memory.go:184] no items to output this cycle
I0319 18:24:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 18:24:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:13.409810 543705 memory.go:191] Add success.
I0319 18:24:13.409819 543705 cpu.go:282] Add success.
W0319 18:24:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:24:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:24:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:24:13.420335 543705 net.go:648] Add success.
I0319 18:24:13.423394 543705 net.go:770] primary dev: ETH0
I0319 18:24:13.423407 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:24:13.423418 543705 net.go:698] Add success.
I0319 18:24:13.470680 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2c14d1fa-fd2c-4b37-bbc5-06ec4a8a2137","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:24:13.470710 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:24:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:24:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:24:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 18:24:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:24:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 18:24:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:24:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:24:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:24:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:24:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:24:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:24:20.437882 543705 disk_info.go:125] begin check local disk info of client
I0319 18:24:20.440412 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:24:20.440418 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a63c0 0xc0004a6400]
E0319 18:24:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:23.409813 543705 memory.go:184] no items to output this cycle
I0319 18:24:23.409824 543705 cpu.go:275] no items to output this cycle
E0319 18:24:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:33.409778 543705 memory.go:184] no items to output this cycle
I0319 18:24:33.409782 543705 cpu.go:275] no items to output this cycle
I0319 18:24:37.805729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:24:37.805734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:24:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:43.410722 543705 memory.go:191] Add success.
I0319 18:24:43.409806 543705 cpu.go:282] Add success.
I0319 18:24:43.420415 543705 net.go:648] Add success.
I0319 18:24:43.423275 543705 net.go:770] primary dev: ETH0
I0319 18:24:43.423290 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:24:43.423304 543705 net.go:698] Add success.
I0319 18:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:24:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:24:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:24:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:24:53.409807 543705 memory.go:184] no items to output this cycle
I0319 18:24:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:25:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:03.409765 543705 memory.go:184] no items to output this cycle
I0319 18:25:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 18:25:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:13.409797 543705 memory.go:191] Add success.
I0319 18:25:13.409816 543705 cpu.go:282] Add success.
W0319 18:25:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:25:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:25:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:25:13.419709 543705 net.go:648] Add success.
I0319 18:25:13.422561 543705 net.go:770] primary dev: ETH0
I0319 18:25:13.422578 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:25:13.422595 543705 net.go:698] Add success.
I0319 18:25:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:25:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:25:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 18:25:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:25:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 18:25:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:25:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:25:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:25:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:25:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:25:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:25:20.440892 543705 disk_info.go:125] begin check local disk info of client
I0319 18:25:20.443388 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:25:20.443396 543705 disk_info.go:196] parse disk info done, disk is : [0xc000575cc0 0xc000575d00]
E0319 18:25:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:23.409781 543705 memory.go:184] no items to output this cycle
I0319 18:25:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 18:25:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:33.409781 543705 memory.go:184] no items to output this cycle
I0319 18:25:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:25:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:43.409773 543705 memory.go:191] Add success.
I0319 18:25:43.409801 543705 cpu.go:282] Add success.
I0319 18:25:43.419906 543705 net.go:648] Add success.
I0319 18:25:43.422898 543705 net.go:770] primary dev: ETH0
I0319 18:25:43.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:25:43.422924 543705 net.go:698] Add success.
I0319 18:25:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:25:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:25:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:25:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:25:53.409786 543705 memory.go:184] no items to output this cycle
I0319 18:25:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 18:26:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:03.409767 543705 memory.go:184] no items to output this cycle
I0319 18:26:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 18:26:13.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:13.409812 543705 cpu.go:282] Add success.
I0319 18:26:13.409833 543705 memory.go:191] Add success.
W0319 18:26:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:26:13.409898 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:26:13.409903 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:26:13.419888 543705 net.go:648] Add success.
I0319 18:26:13.423114 543705 net.go:770] primary dev: ETH0
I0319 18:26:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:26:13.423153 543705 net.go:698] Add success.
I0319 18:26:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:26:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:26:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0319 18:26:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:26:14.456616 543705 disk_worker.go:494] system disk:vda1
I0319 18:26:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:26:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:26:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:26:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:26:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:26:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:26:20.443924 543705 disk_info.go:125] begin check local disk info of client
I0319 18:26:20.446375 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:26:20.446382 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035e3c0 0xc00035e400]
E0319 18:26:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:23.409811 543705 memory.go:184] no items to output this cycle
I0319 18:26:23.409824 543705 cpu.go:275] no items to output this cycle
E0319 18:26:33.410170 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:33.410188 543705 memory.go:184] no items to output this cycle
I0319 18:26:33.410255 543705 cpu.go:275] no items to output this cycle
E0319 18:26:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:43.409815 543705 memory.go:191] Add success.
I0319 18:26:43.409823 543705 cpu.go:282] Add success.
I0319 18:26:43.419995 543705 net.go:648] Add success.
I0319 18:26:43.423053 543705 net.go:770] primary dev: ETH0
I0319 18:26:43.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:26:43.423089 543705 net.go:698] Add success.
I0319 18:26:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:26:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:26:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:26:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:26:53.409809 543705 memory.go:184] no items to output this cycle
I0319 18:26:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:27:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:03.409764 543705 memory.go:184] no items to output this cycle
I0319 18:27:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 18:27:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:13.409795 543705 memory.go:191] Add success.
I0319 18:27:13.409796 543705 cpu.go:282] Add success.
W0319 18:27:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:27:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:27:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:27:13.420129 543705 net.go:648] Add success.
I0319 18:27:13.428576 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 18:27:13.428644 543705 net.go:770] primary dev: ETH0
I0319 18:27:13.428657 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:27:13.428670 543705 net.go:698] Add success.
I0319 18:27:13.453249 543705 event_worker.go:152] Polling the log file for events...
I0319 18:27:13.463810 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d639cc80-2e79-48fa-b940-62b397c95fff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:27:13.463841 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 18:27:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:27:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 18:27:14.455193 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:27:14.456833 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:27:14.456842 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:27:14.456848 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:27:14.456880 543705 disk_worker.go:494] system disk:vda1
I0319 18:27:14.456908 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:27:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:27:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:27:16.457901 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:27:16.457900 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:27:16.457955 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:27:16.457975 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:27:16.472290 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:27:20.446924 543705 disk_info.go:125] begin check local disk info of client
I0319 18:27:20.449303 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:27:20.449309 543705 disk_info.go:196] parse disk info done, disk is : [0xc000397300 0xc000397340]
E0319 18:27:23.410205 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:23.410224 543705 memory.go:184] no items to output this cycle
I0319 18:27:23.410240 543705 cpu.go:275] no items to output this cycle
E0319 18:27:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:33.409764 543705 memory.go:184] no items to output this cycle
I0319 18:27:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 18:27:37.805866 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:27:37.805873 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:27:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:43.410837 543705 memory.go:191] Add success.
I0319 18:27:43.409809 543705 cpu.go:282] Add success.
I0319 18:27:43.420585 543705 net.go:648] Add success.
I0319 18:27:43.423767 543705 net.go:770] primary dev: ETH0
I0319 18:27:43.423781 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:27:43.423793 543705 net.go:698] Add success.
I0319 18:27:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:27:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:27:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:27:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:27:53.409806 543705 memory.go:184] no items to output this cycle
I0319 18:27:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:28:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:03.409784 543705 cpu.go:275] no items to output this cycle
I0319 18:28:03.409789 543705 memory.go:184] no items to output this cycle
E0319 18:28:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:13.409786 543705 memory.go:191] Add success.
I0319 18:28:13.409807 543705 cpu.go:282] Add success.
W0319 18:28:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:28:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:28:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:28:13.420053 543705 net.go:648] Add success.
I0319 18:28:13.422863 543705 net.go:770] primary dev: ETH0
I0319 18:28:13.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:28:13.422889 543705 net.go:698] Add success.
I0319 18:28:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:28:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:28:14.455151 543705 disk_worker.go:708] disk space is not compliant
W0319 18:28:14.455154 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:28:14.456468 543705 disk_worker.go:494] system disk:vda1
I0319 18:28:14.456511 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:28:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:28:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:28:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:28:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:28:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:28:20.449942 543705 disk_info.go:125] begin check local disk info of client
I0319 18:28:20.452446 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:28:20.452452 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003761c0 0xc000376200]
E0319 18:28:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:23.409814 543705 memory.go:184] no items to output this cycle
I0319 18:28:23.409827 543705 cpu.go:275] no items to output this cycle
E0319 18:28:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:33.409786 543705 memory.go:184] no items to output this cycle
I0319 18:28:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:28:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:43.409821 543705 memory.go:191] Add success.
I0319 18:28:43.409830 543705 cpu.go:282] Add success.
I0319 18:28:43.419965 543705 net.go:648] Add success.
I0319 18:28:43.422708 543705 net.go:770] primary dev: ETH0
I0319 18:28:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:28:43.422733 543705 net.go:698] Add success.
I0319 18:28:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:28:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:28:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:28:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:28:53.409799 543705 memory.go:184] no items to output this cycle
I0319 18:28:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 18:29:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:03.409806 543705 memory.go:184] no items to output this cycle
I0319 18:29:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 18:29:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:13.409821 543705 memory.go:191] Add success.
I0319 18:29:13.409832 543705 cpu.go:282] Add success.
W0319 18:29:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:29:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:29:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:29:13.420169 543705 net.go:648] Add success.
I0319 18:29:13.422939 543705 net.go:770] primary dev: ETH0
I0319 18:29:13.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:29:13.423165 543705 net.go:698] Add success.
I0319 18:29:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:29:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:29:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 18:29:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:29:14.456488 543705 disk_worker.go:494] system disk:vda1
I0319 18:29:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:29:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:29:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:29:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:29:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:29:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:29:20.452947 543705 disk_info.go:125] begin check local disk info of client
I0319 18:29:20.455362 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:29:20.455367 543705 disk_info.go:196] parse disk info done, disk is : [0xc000535a00 0xc000535a40]
E0319 18:29:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:23.409817 543705 memory.go:184] no items to output this cycle
I0319 18:29:23.409828 543705 cpu.go:275] no items to output this cycle
E0319 18:29:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:33.409800 543705 memory.go:184] no items to output this cycle
I0319 18:29:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 18:29:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:43.409798 543705 memory.go:191] Add success.
I0319 18:29:43.409816 543705 cpu.go:282] Add success.
I0319 18:29:43.419873 543705 net.go:648] Add success.
I0319 18:29:43.422428 543705 net.go:770] primary dev: ETH0
I0319 18:29:43.422441 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:29:43.422454 543705 net.go:698] Add success.
I0319 18:29:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:29:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:29:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:29:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:29:53.409792 543705 memory.go:184] no items to output this cycle
I0319 18:29:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 18:30:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:03.409781 543705 memory.go:184] no items to output this cycle
I0319 18:30:03.409806 543705 cpu.go:275] no items to output this cycle
E0319 18:30:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:13.409833 543705 memory.go:191] Add success.
I0319 18:30:13.409844 543705 cpu.go:282] Add success.
W0319 18:30:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:30:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:30:13.409886 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:30:13.420130 543705 net.go:648] Add success.
I0319 18:30:13.422885 543705 net.go:770] primary dev: ETH0
I0319 18:30:13.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:30:13.422911 543705 net.go:698] Add success.
I0319 18:30:13.471204 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c667f95c-0c65-486a-84b6-bd68ef877a06","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:30:13.471237 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:30:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:30:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:30:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 18:30:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:30:14.456498 543705 disk_worker.go:494] system disk:vda1
I0319 18:30:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:30:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:30:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:30:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:30:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:30:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:30:20.455975 543705 disk_info.go:125] begin check local disk info of client
I0319 18:30:20.458430 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:30:20.458436 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509440 0xc000509480]
E0319 18:30:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:23.409780 543705 memory.go:184] no items to output this cycle
I0319 18:30:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:30:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:33.409797 543705 memory.go:184] no items to output this cycle
I0319 18:30:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 18:30:37.807738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:30:37.807745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:30:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:43.410661 543705 memory.go:191] Add success.
I0319 18:30:43.409804 543705 cpu.go:282] Add success.
I0319 18:30:43.420348 543705 net.go:648] Add success.
I0319 18:30:43.422802 543705 net.go:770] primary dev: ETH0
I0319 18:30:43.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:30:43.422828 543705 net.go:698] Add success.
I0319 18:30:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:30:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:30:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:30:53.410228 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:30:53.410247 543705 memory.go:184] no items to output this cycle
I0319 18:30:53.410281 543705 cpu.go:275] no items to output this cycle
E0319 18:31:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:03.409766 543705 memory.go:184] no items to output this cycle
I0319 18:31:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 18:31:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:13.409792 543705 memory.go:191] Add success.
I0319 18:31:13.409816 543705 cpu.go:282] Add success.
W0319 18:31:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:31:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:31:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:31:13.420158 543705 net.go:770] primary dev: ETH0
I0319 18:31:13.420172 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:31:13.420186 543705 net.go:698] Add success.
I0319 18:31:13.420533 543705 net.go:648] Add success.
I0319 18:31:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:31:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:31:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 18:31:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:31:14.457032 543705 disk_worker.go:494] system disk:vda1
I0319 18:31:14.457075 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:31:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:31:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:31:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:31:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:31:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:31:20.458990 543705 disk_info.go:125] begin check local disk info of client
I0319 18:31:20.461347 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:31:20.461354 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ab80 0xc00007abc0]
E0319 18:31:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:23.409773 543705 memory.go:184] no items to output this cycle
I0319 18:31:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 18:31:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:33.409784 543705 cpu.go:275] no items to output this cycle
I0319 18:31:33.409791 543705 memory.go:184] no items to output this cycle
E0319 18:31:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:43.409792 543705 memory.go:191] Add success.
I0319 18:31:43.409794 543705 cpu.go:282] Add success.
I0319 18:31:43.419995 543705 net.go:648] Add success.
I0319 18:31:43.422813 543705 net.go:770] primary dev: ETH0
I0319 18:31:43.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:31:43.422839 543705 net.go:698] Add success.
I0319 18:31:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:31:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:31:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:31:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:31:53.409772 543705 memory.go:184] no items to output this cycle
I0319 18:31:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:32:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:03.409793 543705 memory.go:184] no items to output this cycle
I0319 18:32:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 18:32:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:13.409814 543705 memory.go:191] Add success.
I0319 18:32:13.409824 543705 cpu.go:282] Add success.
W0319 18:32:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:32:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:32:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:32:13.420519 543705 net.go:648] Add success.
I0319 18:32:13.423219 543705 net.go:770] primary dev: ETH0
I0319 18:32:13.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:32:13.423247 543705 net.go:698] Add success.
W0319 18:32:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:32:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 18:32:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:32:14.456047 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:32:14.456057 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:32:14.456063 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:32:14.456911 543705 disk_worker.go:494] system disk:vda1
I0319 18:32:14.456940 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:32:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:32:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:32:16.457910 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:32:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:32:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:32:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:32:16.472351 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:32:20.462007 543705 disk_info.go:125] begin check local disk info of client
I0319 18:32:20.464439 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:32:20.464446 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384300 0xc000384340]
E0319 18:32:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:23.409812 543705 memory.go:184] no items to output this cycle
I0319 18:32:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 18:32:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:33.409776 543705 memory.go:184] no items to output this cycle
I0319 18:32:33.409783 543705 cpu.go:275] no items to output this cycle
E0319 18:32:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:43.409795 543705 memory.go:191] Add success.
I0319 18:32:43.409796 543705 cpu.go:282] Add success.
I0319 18:32:43.419978 543705 net.go:648] Add success.
I0319 18:32:43.422786 543705 net.go:770] primary dev: ETH0
I0319 18:32:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:32:43.422813 543705 net.go:698] Add success.
I0319 18:32:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:32:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:32:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:32:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:32:53.409788 543705 memory.go:184] no items to output this cycle
I0319 18:32:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 18:33:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:03.409779 543705 memory.go:184] no items to output this cycle
I0319 18:33:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 18:33:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:13.409799 543705 memory.go:191] Add success.
I0319 18:33:13.409799 543705 cpu.go:282] Add success.
W0319 18:33:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:33:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:33:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:33:13.420291 543705 net.go:648] Add success.
I0319 18:33:13.423546 543705 net.go:770] primary dev: ETH0
I0319 18:33:13.423559 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:33:13.423571 543705 net.go:698] Add success.
I0319 18:33:13.469713 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"02870f2e-2b2f-470e-ab1e-4de4c8144180","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:33:13.469748 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:33:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:33:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:33:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 18:33:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:33:14.457473 543705 disk_worker.go:494] system disk:vda1
I0319 18:33:14.457512 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:33:15.456024 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:33:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:33:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:33:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:33:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:33:20.465013 543705 disk_info.go:125] begin check local disk info of client
I0319 18:33:20.467529 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:33:20.467536 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004645c0 0xc000464600]
E0319 18:33:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:23.409789 543705 memory.go:184] no items to output this cycle
I0319 18:33:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 18:33:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:33.409782 543705 memory.go:184] no items to output this cycle
I0319 18:33:33.409786 543705 cpu.go:275] no items to output this cycle
I0319 18:33:37.809732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:33:37.809739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:33:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:43.410670 543705 memory.go:191] Add success.
I0319 18:33:43.409799 543705 cpu.go:282] Add success.
I0319 18:33:43.420221 543705 net.go:770] primary dev: ETH0
I0319 18:33:43.420234 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:33:43.420247 543705 net.go:698] Add success.
I0319 18:33:43.420594 543705 net.go:648] Add success.
I0319 18:33:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:33:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:33:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:33:53.410378 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:33:53.410394 543705 memory.go:184] no items to output this cycle
I0319 18:33:53.410396 543705 cpu.go:275] no items to output this cycle
E0319 18:34:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:03.409767 543705 memory.go:184] no items to output this cycle
I0319 18:34:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 18:34:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:13.409794 543705 memory.go:191] Add success.
I0319 18:34:13.409813 543705 cpu.go:282] Add success.
W0319 18:34:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:34:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:34:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:34:13.420107 543705 net.go:648] Add success.
I0319 18:34:13.422873 543705 net.go:770] primary dev: ETH0
I0319 18:34:13.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:34:13.422898 543705 net.go:698] Add success.
I0319 18:34:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:34:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:34:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 18:34:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:34:14.456822 543705 disk_worker.go:494] system disk:vda1
I0319 18:34:14.456851 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:34:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:34:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:34:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:34:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:34:16.472431 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:34:20.468026 543705 disk_info.go:125] begin check local disk info of client
I0319 18:34:20.470450 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:34:20.470456 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b4280 0xc0004b42c0]
E0319 18:34:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:23.409812 543705 memory.go:184] no items to output this cycle
I0319 18:34:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 18:34:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:33.409786 543705 memory.go:184] no items to output this cycle
I0319 18:34:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 18:34:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:43.409792 543705 memory.go:191] Add success.
I0319 18:34:43.409793 543705 cpu.go:282] Add success.
I0319 18:34:43.419877 543705 net.go:648] Add success.
I0319 18:34:43.422686 543705 net.go:770] primary dev: ETH0
I0319 18:34:43.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:34:43.422711 543705 net.go:698] Add success.
I0319 18:34:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:34:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:34:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:34:53.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:34:53.409821 543705 memory.go:184] no items to output this cycle
I0319 18:34:53.409829 543705 cpu.go:275] no items to output this cycle
E0319 18:35:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:03.409776 543705 memory.go:184] no items to output this cycle
I0319 18:35:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 18:35:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:13.409785 543705 memory.go:191] Add success.
I0319 18:35:13.409803 543705 cpu.go:282] Add success.
W0319 18:35:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:35:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:35:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:35:13.420149 543705 net.go:648] Add success.
I0319 18:35:13.423374 543705 net.go:770] primary dev: ETH0
I0319 18:35:13.423389 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:35:13.423402 543705 net.go:698] Add success.
I0319 18:35:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:35:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:35:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0319 18:35:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:35:14.456582 543705 disk_worker.go:494] system disk:vda1
I0319 18:35:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:35:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:35:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:35:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:35:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:35:20.471046 543705 disk_info.go:125] begin check local disk info of client
I0319 18:35:20.473446 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:35:20.473452 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2100 0xc0003b2140]
E0319 18:35:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:23.409780 543705 memory.go:184] no items to output this cycle
I0319 18:35:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 18:35:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:33.409782 543705 memory.go:184] no items to output this cycle
I0319 18:35:33.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:35:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:43.409775 543705 memory.go:191] Add success.
I0319 18:35:43.409816 543705 cpu.go:282] Add success.
I0319 18:35:43.419967 543705 net.go:648] Add success.
I0319 18:35:43.422927 543705 net.go:770] primary dev: ETH0
I0319 18:35:43.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:35:43.422961 543705 net.go:698] Add success.
I0319 18:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:35:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:35:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:35:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:35:53.409787 543705 memory.go:184] no items to output this cycle
I0319 18:35:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:36:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:03.409767 543705 memory.go:184] no items to output this cycle
I0319 18:36:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:36:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:13.409795 543705 cpu.go:282] Add success.
I0319 18:36:13.409796 543705 memory.go:191] Add success.
W0319 18:36:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:36:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:36:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:36:13.420054 543705 net.go:648] Add success.
I0319 18:36:13.422937 543705 net.go:770] primary dev: ETH0
I0319 18:36:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:36:13.422961 543705 net.go:698] Add success.
I0319 18:36:13.464294 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d98836fa-a207-4fd0-85ca-f1216748338f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:36:13.464328 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:36:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:36:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:36:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 18:36:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:36:14.456676 543705 disk_worker.go:494] system disk:vda1
I0319 18:36:14.456705 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:36:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:36:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:36:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:36:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:36:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:36:20.474058 543705 disk_info.go:125] begin check local disk info of client
I0319 18:36:20.476519 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:36:20.476526 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d0700 0xc0004d0740]
E0319 18:36:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:23.409786 543705 cpu.go:275] no items to output this cycle
I0319 18:36:23.409796 543705 memory.go:184] no items to output this cycle
E0319 18:36:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:33.409799 543705 memory.go:184] no items to output this cycle
I0319 18:36:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 18:36:37.809879 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:36:37.809885 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:36:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:43.410848 543705 memory.go:191] Add success.
I0319 18:36:43.409811 543705 cpu.go:282] Add success.
I0319 18:36:43.420596 543705 net.go:648] Add success.
I0319 18:36:43.423112 543705 net.go:770] primary dev: ETH0
I0319 18:36:43.423126 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:36:43.423141 543705 net.go:698] Add success.
I0319 18:36:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:36:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:36:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:36:53.409788 543705 memory.go:184] no items to output this cycle
I0319 18:36:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 18:37:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:03.409802 543705 memory.go:184] no items to output this cycle
I0319 18:37:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:37:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:13.409786 543705 memory.go:191] Add success.
I0319 18:37:13.409807 543705 cpu.go:282] Add success.
W0319 18:37:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:37:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:37:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:37:13.420132 543705 net.go:648] Add success.
I0319 18:37:13.422824 543705 net.go:770] primary dev: ETH0
I0319 18:37:13.422839 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:37:13.422850 543705 net.go:698] Add success.
I0319 18:37:13.453401 543705 event_worker.go:152] Polling the log file for events...
W0319 18:37:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:37:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 18:37:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:37:14.455916 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:37:14.455925 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:37:14.455931 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:37:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 18:37:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:37:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:37:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:37:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:37:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:37:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:37:16.458024 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:37:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:37:20.477073 543705 disk_info.go:125] begin check local disk info of client
I0319 18:37:20.479477 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:37:20.479483 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003440c0 0xc000344100]
E0319 18:37:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:23.409812 543705 memory.go:184] no items to output this cycle
I0319 18:37:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:37:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:33.409779 543705 cpu.go:275] no items to output this cycle
I0319 18:37:33.409785 543705 memory.go:184] no items to output this cycle
E0319 18:37:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:43.409809 543705 memory.go:191] Add success.
I0319 18:37:43.409818 543705 cpu.go:282] Add success.
I0319 18:37:43.419944 543705 net.go:648] Add success.
I0319 18:37:43.422867 543705 net.go:770] primary dev: ETH0
I0319 18:37:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:37:43.422896 543705 net.go:698] Add success.
I0319 18:37:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:37:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:37:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:37:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:37:53.409782 543705 memory.go:184] no items to output this cycle
I0319 18:37:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 18:38:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:03.409776 543705 memory.go:184] no items to output this cycle
I0319 18:38:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:38:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:13.409792 543705 memory.go:191] Add success.
I0319 18:38:13.409798 543705 cpu.go:282] Add success.
W0319 18:38:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:38:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:38:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:38:13.420142 543705 net.go:648] Add success.
I0319 18:38:13.422997 543705 net.go:770] primary dev: ETH0
I0319 18:38:13.423011 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:38:13.423023 543705 net.go:698] Add success.
I0319 18:38:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:38:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:38:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 18:38:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:38:14.456606 543705 disk_worker.go:494] system disk:vda1
I0319 18:38:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:38:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:38:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:38:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:38:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:38:20.480085 543705 disk_info.go:125] begin check local disk info of client
I0319 18:38:20.482911 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:38:20.482919 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049e000 0xc00049e040]
E0319 18:38:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:23.409789 543705 memory.go:184] no items to output this cycle
I0319 18:38:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 18:38:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:33.409775 543705 memory.go:184] no items to output this cycle
I0319 18:38:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:38:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:43.409807 543705 memory.go:191] Add success.
I0319 18:38:43.409815 543705 cpu.go:282] Add success.
I0319 18:38:43.419978 543705 net.go:648] Add success.
I0319 18:38:43.422600 543705 net.go:770] primary dev: ETH0
I0319 18:38:43.422615 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:38:43.422629 543705 net.go:698] Add success.
I0319 18:38:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:38:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:38:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:38:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:38:53.409775 543705 memory.go:184] no items to output this cycle
I0319 18:38:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 18:39:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:03.409777 543705 memory.go:184] no items to output this cycle
I0319 18:39:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:39:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:13.409788 543705 cpu.go:282] Add success.
I0319 18:39:13.409795 543705 memory.go:191] Add success.
W0319 18:39:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:39:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:39:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:39:13.420056 543705 net.go:648] Add success.
I0319 18:39:13.422695 543705 net.go:770] primary dev: ETH0
I0319 18:39:13.422710 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:39:13.422724 543705 net.go:698] Add success.
I0319 18:39:13.467736 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"241bfd0f-cd78-48b8-a1bf-96e3a8e23954","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:39:13.467769 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:39:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:39:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:39:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 18:39:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:39:14.456601 543705 disk_worker.go:494] system disk:vda1
I0319 18:39:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:39:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:39:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:39:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:39:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:39:20.483060 543705 disk_info.go:125] begin check local disk info of client
I0319 18:39:20.485727 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:39:20.485733 543705 disk_info.go:196] parse disk info done, disk is : [0xc000296040 0xc000296080]
E0319 18:39:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:23.409814 543705 memory.go:184] no items to output this cycle
I0319 18:39:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:39:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:33.409782 543705 memory.go:184] no items to output this cycle
I0319 18:39:33.409791 543705 cpu.go:275] no items to output this cycle
I0319 18:39:37.810025 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:39:37.810032 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:39:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:43.410639 543705 memory.go:191] Add success.
I0319 18:39:43.409828 543705 cpu.go:282] Add success.
I0319 18:39:43.420324 543705 net.go:648] Add success.
I0319 18:39:43.423148 543705 net.go:770] primary dev: ETH0
I0319 18:39:43.423161 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:39:43.423173 543705 net.go:698] Add success.
I0319 18:39:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:39:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:39:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:39:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:39:53.409804 543705 memory.go:184] no items to output this cycle
I0319 18:39:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:40:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:03.409771 543705 memory.go:184] no items to output this cycle
I0319 18:40:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 18:40:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:13.409812 543705 memory.go:191] Add success.
I0319 18:40:13.409819 543705 cpu.go:282] Add success.
W0319 18:40:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:40:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:40:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:40:13.420197 543705 net.go:648] Add success.
I0319 18:40:13.423155 543705 net.go:770] primary dev: ETH0
I0319 18:40:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:40:13.423181 543705 net.go:698] Add success.
I0319 18:40:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:40:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:40:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 18:40:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:40:14.456499 543705 disk_worker.go:494] system disk:vda1
I0319 18:40:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:40:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:40:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:40:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:40:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:40:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:40:20.486073 543705 disk_info.go:125] begin check local disk info of client
I0319 18:40:20.488515 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:40:20.488521 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b20c0 0xc0003b2100]
E0319 18:40:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:23.409780 543705 memory.go:184] no items to output this cycle
I0319 18:40:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 18:40:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:33.409798 543705 memory.go:184] no items to output this cycle
I0319 18:40:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:40:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:43.409782 543705 memory.go:191] Add success.
I0319 18:40:43.409808 543705 cpu.go:282] Add success.
I0319 18:40:43.419892 543705 net.go:648] Add success.
I0319 18:40:43.422692 543705 net.go:770] primary dev: ETH0
I0319 18:40:43.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:40:43.422721 543705 net.go:698] Add success.
I0319 18:40:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:40:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:40:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:40:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:40:53.409787 543705 memory.go:184] no items to output this cycle
I0319 18:40:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:41:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:03.409802 543705 memory.go:184] no items to output this cycle
I0319 18:41:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 18:41:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:13.409808 543705 memory.go:191] Add success.
I0319 18:41:13.409817 543705 cpu.go:282] Add success.
W0319 18:41:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:41:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:41:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:41:13.420132 543705 net.go:648] Add success.
I0319 18:41:13.422955 543705 net.go:770] primary dev: ETH0
I0319 18:41:13.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:41:13.422983 543705 net.go:698] Add success.
I0319 18:41:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:41:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:41:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 18:41:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:41:14.456568 543705 disk_worker.go:494] system disk:vda1
I0319 18:41:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:41:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:41:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:41:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:41:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:41:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:41:20.489120 543705 disk_info.go:125] begin check local disk info of client
I0319 18:41:20.491618 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:41:20.491623 543705 disk_info.go:196] parse disk info done, disk is : [0xc000253980 0xc0002539c0]
I0319 18:41:23.409944 543705 cpu.go:275] no items to output this cycle
E0319 18:41:23.410015 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:23.410033 543705 memory.go:184] no items to output this cycle
E0319 18:41:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:33.409774 543705 memory.go:184] no items to output this cycle
I0319 18:41:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 18:41:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:43.409794 543705 memory.go:191] Add success.
I0319 18:41:43.409805 543705 cpu.go:282] Add success.
I0319 18:41:43.419909 543705 net.go:648] Add success.
I0319 18:41:43.422538 543705 net.go:770] primary dev: ETH0
I0319 18:41:43.422551 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:41:43.422563 543705 net.go:698] Add success.
I0319 18:41:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:41:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:41:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:41:53.410208 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:41:53.410225 543705 memory.go:184] no items to output this cycle
I0319 18:41:53.410253 543705 cpu.go:275] no items to output this cycle
E0319 18:42:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:03.409782 543705 memory.go:184] no items to output this cycle
I0319 18:42:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 18:42:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:13.409798 543705 memory.go:191] Add success.
I0319 18:42:13.409800 543705 cpu.go:282] Add success.
W0319 18:42:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:42:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:42:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:42:13.420042 543705 net.go:648] Add success.
I0319 18:42:13.422759 543705 net.go:770] primary dev: ETH0
I0319 18:42:13.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:42:13.422784 543705 net.go:698] Add success.
I0319 18:42:13.464258 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5dce0ac3-f385-49c8-acb4-49b2e5e37360","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:42:13.464292 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 18:42:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:42:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 18:42:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:42:14.456956 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:42:14.456965 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:42:14.456970 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:42:14.457020 543705 disk_worker.go:494] system disk:vda1
I0319 18:42:14.457049 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:42:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:42:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:42:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:42:16.457930 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:42:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:42:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:42:16.472341 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:42:20.492168 543705 disk_info.go:125] begin check local disk info of client
I0319 18:42:20.494592 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:42:20.494598 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c05c0 0xc0003c0600]
E0319 18:42:23.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:23.409826 543705 memory.go:184] no items to output this cycle
I0319 18:42:23.409947 543705 cpu.go:275] no items to output this cycle
E0319 18:42:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:33.409788 543705 memory.go:184] no items to output this cycle
I0319 18:42:33.409795 543705 cpu.go:275] no items to output this cycle
I0319 18:42:37.810169 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:42:37.810175 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:42:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:43.410659 543705 memory.go:191] Add success.
I0319 18:42:43.409831 543705 cpu.go:282] Add success.
I0319 18:42:43.420445 543705 net.go:648] Add success.
I0319 18:42:43.423088 543705 net.go:770] primary dev: ETH0
I0319 18:42:43.423101 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:42:43.423113 543705 net.go:698] Add success.
I0319 18:42:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:42:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:42:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:42:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:42:53.409795 543705 memory.go:184] no items to output this cycle
I0319 18:42:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 18:43:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:03.409781 543705 memory.go:184] no items to output this cycle
I0319 18:43:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 18:43:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:13.409827 543705 memory.go:191] Add success.
I0319 18:43:13.409838 543705 cpu.go:282] Add success.
W0319 18:43:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:43:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:43:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:43:13.420154 543705 net.go:648] Add success.
I0319 18:43:13.422928 543705 net.go:770] primary dev: ETH0
I0319 18:43:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:43:13.422954 543705 net.go:698] Add success.
I0319 18:43:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:43:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:43:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 18:43:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:43:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 18:43:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:43:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:43:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:43:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:43:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:43:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:43:20.495166 543705 disk_info.go:125] begin check local disk info of client
I0319 18:43:20.497826 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:43:20.497832 543705 disk_info.go:196] parse disk info done, disk is : [0xc000263c00 0xc000263c40]
E0319 18:43:23.409812 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:23.409834 543705 memory.go:184] no items to output this cycle
I0319 18:43:23.409842 543705 cpu.go:275] no items to output this cycle
E0319 18:43:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:33.409816 543705 memory.go:184] no items to output this cycle
I0319 18:43:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 18:43:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:43.409775 543705 memory.go:191] Add success.
I0319 18:43:43.409805 543705 cpu.go:282] Add success.
I0319 18:43:43.419885 543705 net.go:648] Add success.
I0319 18:43:43.422936 543705 net.go:770] primary dev: ETH0
I0319 18:43:43.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:43:43.422961 543705 net.go:698] Add success.
I0319 18:43:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:43:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:43:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:43:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:43:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 18:43:53.409794 543705 memory.go:184] no items to output this cycle
E0319 18:44:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:03.409780 543705 memory.go:184] no items to output this cycle
I0319 18:44:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:44:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:13.409807 543705 memory.go:191] Add success.
I0319 18:44:13.409812 543705 cpu.go:282] Add success.
W0319 18:44:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:44:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:44:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:44:13.420123 543705 net.go:648] Add success.
I0319 18:44:13.422695 543705 net.go:770] primary dev: ETH0
I0319 18:44:13.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:44:13.422720 543705 net.go:698] Add success.
I0319 18:44:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:44:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:44:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 18:44:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:44:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 18:44:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:44:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:44:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:44:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:44:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:44:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:44:20.497912 543705 disk_info.go:125] begin check local disk info of client
I0319 18:44:20.500417 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:44:20.500424 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036b200 0xc00036b240]
E0319 18:44:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:23.409775 543705 memory.go:184] no items to output this cycle
I0319 18:44:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 18:44:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:33.409804 543705 memory.go:184] no items to output this cycle
I0319 18:44:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 18:44:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:43.409808 543705 memory.go:191] Add success.
I0319 18:44:43.409817 543705 cpu.go:282] Add success.
I0319 18:44:43.420006 543705 net.go:648] Add success.
I0319 18:44:43.422774 543705 net.go:770] primary dev: ETH0
I0319 18:44:43.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:44:43.422799 543705 net.go:698] Add success.
I0319 18:44:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:44:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:44:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:44:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:44:53.409779 543705 memory.go:184] no items to output this cycle
I0319 18:44:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 18:45:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:03.409786 543705 memory.go:184] no items to output this cycle
I0319 18:45:03.409790 543705 cpu.go:275] no items to output this cycle
I0319 18:45:13.409788 543705 cpu.go:282] Add success.
E0319 18:45:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:13.409809 543705 memory.go:191] Add success.
W0319 18:45:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:45:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:45:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:45:13.420051 543705 net.go:648] Add success.
I0319 18:45:13.423089 543705 net.go:770] primary dev: ETH0
I0319 18:45:13.423102 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:45:13.423115 543705 net.go:698] Add success.
I0319 18:45:13.470292 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"67db016b-4c18-45df-8afc-69c8402dae16","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:45:13.470325 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:45:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:45:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:45:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 18:45:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:45:14.456514 543705 disk_worker.go:494] system disk:vda1
I0319 18:45:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:45:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:45:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:45:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:45:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:45:20.501190 543705 disk_info.go:125] begin check local disk info of client
I0319 18:45:20.503603 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:45:20.503609 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5380 0xc0002a53c0]
E0319 18:45:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:23.409807 543705 memory.go:184] no items to output this cycle
I0319 18:45:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 18:45:33.410651 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:33.410672 543705 memory.go:184] no items to output this cycle
I0319 18:45:33.410688 543705 cpu.go:275] no items to output this cycle
I0319 18:45:37.811761 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:45:37.811768 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:45:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:43.410598 543705 memory.go:191] Add success.
I0319 18:45:43.409788 543705 cpu.go:282] Add success.
I0319 18:45:43.420309 543705 net.go:648] Add success.
I0319 18:45:43.422714 543705 net.go:770] primary dev: ETH0
I0319 18:45:43.422732 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:45:43.422747 543705 net.go:698] Add success.
I0319 18:45:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:45:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:45:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:45:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:45:53.409792 543705 memory.go:184] no items to output this cycle
I0319 18:45:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 18:46:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:03.409786 543705 memory.go:184] no items to output this cycle
I0319 18:46:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:46:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:13.409785 543705 memory.go:191] Add success.
W0319 18:46:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:46:13.409814 543705 cpu.go:282] Add success.
W0319 18:46:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:46:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:46:13.420097 543705 net.go:648] Add success.
I0319 18:46:13.422905 543705 net.go:770] primary dev: ETH0
I0319 18:46:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:46:13.422930 543705 net.go:698] Add success.
I0319 18:46:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:46:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:46:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 18:46:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:46:14.456590 543705 disk_worker.go:494] system disk:vda1
I0319 18:46:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:46:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:46:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:46:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:46:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:46:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:46:20.504210 543705 disk_info.go:125] begin check local disk info of client
I0319 18:46:20.506556 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:46:20.506562 543705 disk_info.go:196] parse disk info done, disk is : [0xc000275480 0xc0002754c0]
E0319 18:46:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:23.409813 543705 memory.go:184] no items to output this cycle
I0319 18:46:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 18:46:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:33.409766 543705 memory.go:184] no items to output this cycle
I0319 18:46:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 18:46:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:43.409815 543705 memory.go:191] Add success.
I0319 18:46:43.409825 543705 cpu.go:282] Add success.
I0319 18:46:43.420178 543705 net.go:648] Add success.
I0319 18:46:43.422826 543705 net.go:770] primary dev: ETH0
I0319 18:46:43.422839 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:46:43.422850 543705 net.go:698] Add success.
I0319 18:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:46:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:46:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:46:53.410417 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:46:53.410441 543705 memory.go:184] no items to output this cycle
I0319 18:46:53.410444 543705 cpu.go:275] no items to output this cycle
E0319 18:47:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:03.409766 543705 memory.go:184] no items to output this cycle
I0319 18:47:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:47:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:13.409788 543705 memory.go:191] Add success.
I0319 18:47:13.409793 543705 cpu.go:282] Add success.
W0319 18:47:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:47:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:47:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:47:13.420090 543705 net.go:648] Add success.
I0319 18:47:13.422772 543705 net.go:770] primary dev: ETH0
I0319 18:47:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:47:13.422798 543705 net.go:698] Add success.
I0319 18:47:13.453352 543705 event_worker.go:152] Polling the log file for events...
W0319 18:47:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:47:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 18:47:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:47:14.456862 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:47:14.456871 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:47:14.456877 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:47:14.456949 543705 disk_worker.go:494] system disk:vda1
I0319 18:47:14.456994 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:47:15.456851 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:47:15.456860 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:47:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:47:16.457933 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:47:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:47:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:47:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:47:20.507225 543705 disk_info.go:125] begin check local disk info of client
I0319 18:47:20.509626 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:47:20.509632 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7140 0xc0003b7180]
E0319 18:47:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:23.409792 543705 cpu.go:275] no items to output this cycle
I0319 18:47:23.409804 543705 memory.go:184] no items to output this cycle
E0319 18:47:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:33.409771 543705 memory.go:184] no items to output this cycle
I0319 18:47:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 18:47:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:43.409815 543705 memory.go:191] Add success.
I0319 18:47:43.409822 543705 cpu.go:282] Add success.
I0319 18:47:43.419738 543705 net.go:648] Add success.
I0319 18:47:43.422853 543705 net.go:770] primary dev: ETH0
I0319 18:47:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:47:43.422880 543705 net.go:698] Add success.
I0319 18:47:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:47:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:47:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:47:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:47:53.409813 543705 memory.go:184] no items to output this cycle
I0319 18:47:53.409827 543705 cpu.go:275] no items to output this cycle
E0319 18:48:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:03.409808 543705 memory.go:184] no items to output this cycle
I0319 18:48:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 18:48:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:13.409788 543705 memory.go:191] Add success.
I0319 18:48:13.409791 543705 cpu.go:282] Add success.
W0319 18:48:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:48:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:48:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:48:13.420314 543705 net.go:648] Add success.
I0319 18:48:13.423111 543705 net.go:770] primary dev: ETH0
I0319 18:48:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:48:13.423137 543705 net.go:698] Add success.
I0319 18:48:13.469114 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d8b31f26-8c8c-453c-bac4-3ce918778a8a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:48:13.469150 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:48:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:48:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:48:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 18:48:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:48:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 18:48:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:48:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:48:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:48:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:48:16.472447 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:48:20.510240 543705 disk_info.go:125] begin check local disk info of client
I0319 18:48:20.512706 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:48:20.512712 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba0c0 0xc0003ba100]
E0319 18:48:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:23.409821 543705 memory.go:184] no items to output this cycle
I0319 18:48:23.409826 543705 cpu.go:275] no items to output this cycle
E0319 18:48:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:33.409798 543705 memory.go:184] no items to output this cycle
I0319 18:48:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 18:48:37.813743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:48:37.813749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:48:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:43.410870 543705 memory.go:191] Add success.
I0319 18:48:43.409817 543705 cpu.go:282] Add success.
I0319 18:48:43.420573 543705 net.go:648] Add success.
I0319 18:48:43.423178 543705 net.go:770] primary dev: ETH0
I0319 18:48:43.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:48:43.423205 543705 net.go:698] Add success.
I0319 18:48:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:48:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:48:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:48:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:48:53.409814 543705 memory.go:184] no items to output this cycle
I0319 18:48:53.409824 543705 cpu.go:275] no items to output this cycle
E0319 18:49:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:03.409782 543705 memory.go:184] no items to output this cycle
I0319 18:49:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:49:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:13.409806 543705 memory.go:191] Add success.
I0319 18:49:13.409815 543705 cpu.go:282] Add success.
W0319 18:49:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:49:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:49:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:49:13.420251 543705 net.go:648] Add success.
I0319 18:49:13.423046 543705 net.go:770] primary dev: ETH0
I0319 18:49:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:49:13.423070 543705 net.go:698] Add success.
I0319 18:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:49:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:49:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 18:49:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:49:14.456567 543705 disk_worker.go:494] system disk:vda1
I0319 18:49:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:49:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:49:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:49:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:49:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:49:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:49:20.513214 543705 disk_info.go:125] begin check local disk info of client
I0319 18:49:20.515617 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:49:20.515624 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b0300 0xc0003b0340]
E0319 18:49:23.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:23.409818 543705 memory.go:184] no items to output this cycle
I0319 18:49:23.409827 543705 cpu.go:275] no items to output this cycle
E0319 18:49:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:33.409793 543705 memory.go:184] no items to output this cycle
I0319 18:49:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 18:49:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:43.409793 543705 memory.go:191] Add success.
I0319 18:49:43.409796 543705 cpu.go:282] Add success.
I0319 18:49:43.420024 543705 net.go:648] Add success.
I0319 18:49:43.422728 543705 net.go:770] primary dev: ETH0
I0319 18:49:43.422740 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:49:43.422753 543705 net.go:698] Add success.
I0319 18:49:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:49:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:49:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:49:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:49:53.409764 543705 memory.go:184] no items to output this cycle
I0319 18:49:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 18:50:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:03.409781 543705 memory.go:184] no items to output this cycle
I0319 18:50:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 18:50:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:13.409795 543705 cpu.go:282] Add success.
I0319 18:50:13.409802 543705 memory.go:191] Add success.
W0319 18:50:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:50:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:50:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:50:13.420061 543705 net.go:648] Add success.
I0319 18:50:13.422782 543705 net.go:770] primary dev: ETH0
I0319 18:50:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:50:13.422808 543705 net.go:698] Add success.
I0319 18:50:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:50:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:50:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0319 18:50:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:50:14.456619 543705 disk_worker.go:494] system disk:vda1
I0319 18:50:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:50:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:50:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:50:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:50:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:50:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:50:20.516281 543705 disk_info.go:125] begin check local disk info of client
I0319 18:50:20.518707 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:50:20.518712 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032af80 0xc00032afc0]
E0319 18:50:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:23.409811 543705 memory.go:184] no items to output this cycle
I0319 18:50:23.409828 543705 cpu.go:275] no items to output this cycle
E0319 18:50:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:33.409779 543705 memory.go:184] no items to output this cycle
I0319 18:50:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 18:50:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:43.409891 543705 memory.go:191] Add success.
I0319 18:50:43.409892 543705 cpu.go:282] Add success.
I0319 18:50:43.419720 543705 net.go:648] Add success.
I0319 18:50:43.422483 543705 net.go:770] primary dev: ETH0
I0319 18:50:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:50:43.422512 543705 net.go:698] Add success.
I0319 18:50:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:50:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:50:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:50:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:50:53.409779 543705 memory.go:184] no items to output this cycle
I0319 18:50:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 18:51:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:03.409779 543705 memory.go:184] no items to output this cycle
I0319 18:51:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 18:51:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:13.409789 543705 memory.go:191] Add success.
W0319 18:51:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 18:51:13.409822 543705 cpu.go:282] Add success.
W0319 18:51:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:51:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:51:13.420109 543705 net.go:648] Add success.
I0319 18:51:13.422727 543705 net.go:770] primary dev: ETH0
I0319 18:51:13.422740 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:51:13.422752 543705 net.go:698] Add success.
I0319 18:51:13.468773 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"998d72f5-2ab1-4925-a2ce-75dbc08a0836","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:51:13.468805 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:51:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:51:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:51:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 18:51:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:51:14.456755 543705 disk_worker.go:494] system disk:vda1
I0319 18:51:14.456789 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:51:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:51:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:51:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:51:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:51:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:51:20.519235 543705 disk_info.go:125] begin check local disk info of client
I0319 18:51:20.521684 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:51:20.521691 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035efc0 0xc00035f000]
E0319 18:51:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:23.409792 543705 memory.go:184] no items to output this cycle
I0319 18:51:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 18:51:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:33.409767 543705 memory.go:184] no items to output this cycle
I0319 18:51:33.409819 543705 cpu.go:275] no items to output this cycle
I0319 18:51:37.813888 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:51:37.813894 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:51:43.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:43.410886 543705 memory.go:191] Add success.
I0319 18:51:43.409904 543705 cpu.go:282] Add success.
I0319 18:51:43.419756 543705 net.go:648] Add success.
I0319 18:51:43.422819 543705 net.go:770] primary dev: ETH0
I0319 18:51:43.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:51:43.422849 543705 net.go:698] Add success.
I0319 18:51:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:51:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:51:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:51:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:51:53.409826 543705 memory.go:184] no items to output this cycle
I0319 18:51:53.409836 543705 cpu.go:275] no items to output this cycle
E0319 18:52:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:03.409807 543705 memory.go:184] no items to output this cycle
I0319 18:52:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 18:52:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:13.409789 543705 memory.go:191] Add success.
I0319 18:52:13.409805 543705 cpu.go:282] Add success.
W0319 18:52:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:52:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:52:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:52:13.420123 543705 net.go:648] Add success.
I0319 18:52:13.422852 543705 net.go:770] primary dev: ETH0
I0319 18:52:13.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:52:13.422880 543705 net.go:698] Add success.
W0319 18:52:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:52:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 18:52:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:52:14.455918 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:52:14.455926 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:52:14.455932 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:52:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 18:52:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:52:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:52:15.456847 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:52:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:52:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:52:16.458004 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:52:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:52:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:52:20.522250 543705 disk_info.go:125] begin check local disk info of client
I0319 18:52:20.524613 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:52:20.524618 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037ea80 0xc00037eac0]
E0319 18:52:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:23.409792 543705 memory.go:184] no items to output this cycle
I0319 18:52:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 18:52:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:33.409770 543705 memory.go:184] no items to output this cycle
I0319 18:52:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 18:52:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:43.409902 543705 memory.go:191] Add success.
I0319 18:52:43.409982 543705 cpu.go:282] Add success.
I0319 18:52:43.419713 543705 net.go:648] Add success.
I0319 18:52:43.422621 543705 net.go:770] primary dev: ETH0
I0319 18:52:43.422634 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:52:43.422646 543705 net.go:698] Add success.
I0319 18:52:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:52:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:52:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:52:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:52:53.409781 543705 memory.go:184] no items to output this cycle
I0319 18:52:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 18:53:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:03.409798 543705 memory.go:184] no items to output this cycle
I0319 18:53:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 18:53:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:13.409817 543705 memory.go:191] Add success.
I0319 18:53:13.409827 543705 cpu.go:282] Add success.
W0319 18:53:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:53:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:53:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:53:13.420179 543705 net.go:648] Add success.
I0319 18:53:13.422987 543705 net.go:770] primary dev: ETH0
I0319 18:53:13.423002 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:53:13.423016 543705 net.go:698] Add success.
I0319 18:53:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:53:14.455084 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:53:14.455144 543705 disk_worker.go:708] disk space is not compliant
W0319 18:53:14.455147 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:53:14.456466 543705 disk_worker.go:494] system disk:vda1
I0319 18:53:14.456510 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:53:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:53:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:53:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:53:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:53:20.525321 543705 disk_info.go:125] begin check local disk info of client
I0319 18:53:20.527851 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:53:20.527858 543705 disk_info.go:196] parse disk info done, disk is : [0xc000579980 0xc0005799c0]
E0319 18:53:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:23.409817 543705 memory.go:184] no items to output this cycle
I0319 18:53:23.409826 543705 cpu.go:275] no items to output this cycle
E0319 18:53:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:33.409794 543705 memory.go:184] no items to output this cycle
I0319 18:53:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 18:53:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:43.409783 543705 memory.go:191] Add success.
I0319 18:53:43.409807 543705 cpu.go:282] Add success.
I0319 18:53:43.420420 543705 net.go:648] Add success.
I0319 18:53:43.423338 543705 net.go:770] primary dev: ETH0
I0319 18:53:43.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:53:43.423367 543705 net.go:698] Add success.
I0319 18:53:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:53:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:53:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:53:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:53:53.409781 543705 memory.go:184] no items to output this cycle
I0319 18:53:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 18:54:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:03.409774 543705 memory.go:184] no items to output this cycle
I0319 18:54:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 18:54:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:13.409820 543705 memory.go:191] Add success.
I0319 18:54:13.409825 543705 cpu.go:282] Add success.
W0319 18:54:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:54:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:54:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:54:13.420099 543705 net.go:648] Add success.
I0319 18:54:13.422757 543705 net.go:770] primary dev: ETH0
I0319 18:54:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:54:13.422787 543705 net.go:698] Add success.
I0319 18:54:13.516372 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"14255c05-6d02-417d-b03e-568b5f5bddc5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:54:13.516405 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 18:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:54:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:54:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 18:54:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:54:14.456694 543705 disk_worker.go:494] system disk:vda1
I0319 18:54:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:54:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:54:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:54:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:54:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:54:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:54:20.527940 543705 disk_info.go:125] begin check local disk info of client
I0319 18:54:20.530410 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:54:20.530416 543705 disk_info.go:196] parse disk info done, disk is : [0xc000579640 0xc000579680]
E0319 18:54:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:23.409804 543705 memory.go:184] no items to output this cycle
I0319 18:54:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 18:54:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:33.409767 543705 memory.go:184] no items to output this cycle
I0319 18:54:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 18:54:37.815782 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:54:37.815788 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:54:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:43.411068 543705 memory.go:191] Add success.
I0319 18:54:43.409819 543705 cpu.go:282] Add success.
I0319 18:54:43.419756 543705 net.go:648] Add success.
I0319 18:54:43.422579 543705 net.go:770] primary dev: ETH0
I0319 18:54:43.422592 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:54:43.422602 543705 net.go:698] Add success.
I0319 18:54:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:54:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:54:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:54:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:54:53.409817 543705 memory.go:184] no items to output this cycle
I0319 18:54:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 18:55:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:03.409780 543705 memory.go:184] no items to output this cycle
I0319 18:55:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 18:55:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:13.409797 543705 memory.go:191] Add success.
I0319 18:55:13.409799 543705 cpu.go:282] Add success.
W0319 18:55:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:55:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:55:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:55:13.420248 543705 net.go:648] Add success.
I0319 18:55:13.422983 543705 net.go:770] primary dev: ETH0
I0319 18:55:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:55:13.423012 543705 net.go:698] Add success.
I0319 18:55:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:55:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:55:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0319 18:55:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:55:14.456484 543705 disk_worker.go:494] system disk:vda1
I0319 18:55:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:55:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:55:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:55:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:55:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:55:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:55:20.531348 543705 disk_info.go:125] begin check local disk info of client
I0319 18:55:20.533792 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:55:20.533798 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8500 0xc0004a8540]
E0319 18:55:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:23.409814 543705 memory.go:184] no items to output this cycle
I0319 18:55:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 18:55:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:33.409778 543705 memory.go:184] no items to output this cycle
I0319 18:55:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 18:55:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:43.409780 543705 memory.go:191] Add success.
I0319 18:55:43.409812 543705 cpu.go:282] Add success.
I0319 18:55:43.420161 543705 net.go:648] Add success.
I0319 18:55:43.422989 543705 net.go:770] primary dev: ETH0
I0319 18:55:43.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:55:43.423020 543705 net.go:698] Add success.
I0319 18:55:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:55:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:55:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:55:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:55:53.409787 543705 memory.go:184] no items to output this cycle
I0319 18:55:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 18:56:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:03.409804 543705 memory.go:184] no items to output this cycle
I0319 18:56:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 18:56:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:13.409818 543705 memory.go:191] Add success.
I0319 18:56:13.409830 543705 cpu.go:282] Add success.
W0319 18:56:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:56:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:56:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:56:13.420058 543705 net.go:648] Add success.
I0319 18:56:13.422773 543705 net.go:770] primary dev: ETH0
I0319 18:56:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:56:13.422798 543705 net.go:698] Add success.
I0319 18:56:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:56:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:56:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 18:56:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:56:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 18:56:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:56:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:56:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:56:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:56:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:56:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:56:20.534304 543705 disk_info.go:125] begin check local disk info of client
I0319 18:56:20.536717 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:56:20.536723 543705 disk_info.go:196] parse disk info done, disk is : [0xc000578480 0xc0005784c0]
E0319 18:56:23.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:23.409822 543705 memory.go:184] no items to output this cycle
I0319 18:56:23.409827 543705 cpu.go:275] no items to output this cycle
E0319 18:56:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:33.409792 543705 memory.go:184] no items to output this cycle
I0319 18:56:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 18:56:43.409951 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:43.409980 543705 cpu.go:282] Add success.
I0319 18:56:43.409982 543705 memory.go:191] Add success.
I0319 18:56:43.419717 543705 net.go:648] Add success.
I0319 18:56:43.422825 543705 net.go:770] primary dev: ETH0
I0319 18:56:43.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:56:43.422850 543705 net.go:698] Add success.
I0319 18:56:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:56:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:56:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:56:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:56:53.409788 543705 memory.go:184] no items to output this cycle
I0319 18:56:53.409824 543705 cpu.go:275] no items to output this cycle
E0319 18:57:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:03.409792 543705 memory.go:184] no items to output this cycle
I0319 18:57:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 18:57:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:13.409796 543705 cpu.go:282] Add success.
I0319 18:57:13.409802 543705 memory.go:191] Add success.
W0319 18:57:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:57:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:57:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:57:13.420071 543705 net.go:648] Add success.
I0319 18:57:13.428982 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 18:57:13.429055 543705 net.go:770] primary dev: ETH0
I0319 18:57:13.429067 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:57:13.429080 543705 net.go:698] Add success.
I0319 18:57:13.453620 543705 event_worker.go:152] Polling the log file for events...
I0319 18:57:13.463869 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e7587f43-aa93-4b4e-817b-68dd9cbf4962","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 18:57:13.463906 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 18:57:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:57:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 18:57:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0319 18:57:14.455886 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 18:57:14.455894 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 18:57:14.455899 543705 custom_config.go:64] query custom config with name: gpu
I0319 18:57:14.456625 543705 disk_worker.go:494] system disk:vda1
I0319 18:57:14.456660 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 18:57:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 18:57:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:57:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 18:57:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 18:57:16.458003 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:57:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:57:16.472332 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:57:20.537321 543705 disk_info.go:125] begin check local disk info of client
I0319 18:57:20.539690 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:57:20.539696 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001bc4c0 0xc0001bc500]
E0319 18:57:23.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:23.409885 543705 memory.go:184] no items to output this cycle
I0319 18:57:23.409891 543705 cpu.go:275] no items to output this cycle
E0319 18:57:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:33.409782 543705 memory.go:184] no items to output this cycle
I0319 18:57:33.409805 543705 cpu.go:275] no items to output this cycle
I0319 18:57:37.817733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 18:57:37.817739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 18:57:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:43.410657 543705 memory.go:191] Add success.
I0319 18:57:43.409819 543705 cpu.go:282] Add success.
I0319 18:57:43.420398 543705 net.go:648] Add success.
I0319 18:57:43.423157 543705 net.go:770] primary dev: ETH0
I0319 18:57:43.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:57:43.423187 543705 net.go:698] Add success.
I0319 18:57:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:57:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:57:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:57:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:57:53.409769 543705 memory.go:184] no items to output this cycle
I0319 18:57:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 18:58:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:03.409781 543705 memory.go:184] no items to output this cycle
I0319 18:58:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 18:58:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:13.409789 543705 memory.go:191] Add success.
I0319 18:58:13.409790 543705 cpu.go:282] Add success.
W0319 18:58:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:58:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:58:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:58:13.420070 543705 net.go:648] Add success.
I0319 18:58:13.423306 543705 net.go:770] primary dev: ETH0
I0319 18:58:13.423320 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:58:13.423335 543705 net.go:698] Add success.
I0319 18:58:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:58:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:58:14.455229 543705 disk_worker.go:708] disk space is not compliant
W0319 18:58:14.455232 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:58:14.456802 543705 disk_worker.go:494] system disk:vda1
I0319 18:58:14.456845 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:58:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:58:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:58:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:58:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:58:20.539779 543705 disk_info.go:125] begin check local disk info of client
I0319 18:58:20.542353 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:58:20.542360 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e240 0xc00034e280]
E0319 18:58:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:23.409893 543705 cpu.go:275] no items to output this cycle
I0319 18:58:23.409916 543705 memory.go:184] no items to output this cycle
E0319 18:58:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:33.409778 543705 memory.go:184] no items to output this cycle
I0319 18:58:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 18:58:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:43.409785 543705 memory.go:191] Add success.
I0319 18:58:43.409811 543705 cpu.go:282] Add success.
I0319 18:58:43.420056 543705 net.go:648] Add success.
I0319 18:58:43.422796 543705 net.go:770] primary dev: ETH0
I0319 18:58:43.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:58:43.422822 543705 net.go:698] Add success.
I0319 18:58:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:58:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:58:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:58:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:58:53.409805 543705 memory.go:184] no items to output this cycle
I0319 18:58:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 18:59:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:03.409772 543705 memory.go:184] no items to output this cycle
I0319 18:59:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 18:59:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:13.409808 543705 memory.go:191] Add success.
I0319 18:59:13.409816 543705 cpu.go:282] Add success.
W0319 18:59:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 18:59:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 18:59:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 18:59:13.420097 543705 net.go:648] Add success.
I0319 18:59:13.422815 543705 net.go:770] primary dev: ETH0
I0319 18:59:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:59:13.422841 543705 net.go:698] Add success.
I0319 18:59:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 18:59:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 18:59:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0319 18:59:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0319 18:59:14.456502 543705 disk_worker.go:494] system disk:vda1
I0319 18:59:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 18:59:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 18:59:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:59:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:59:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 18:59:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 18:59:20.543399 543705 disk_info.go:125] begin check local disk info of client
I0319 18:59:20.545895 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 18:59:20.545907 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b140 0xc00007b180]
E0319 18:59:23.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:23.409899 543705 cpu.go:275] no items to output this cycle
I0319 18:59:23.409922 543705 memory.go:184] no items to output this cycle
E0319 18:59:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:33.409784 543705 cpu.go:275] no items to output this cycle
I0319 18:59:33.409794 543705 memory.go:184] no items to output this cycle
E0319 18:59:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:43.409790 543705 memory.go:191] Add success.
I0319 18:59:43.409792 543705 cpu.go:282] Add success.
I0319 18:59:43.420027 543705 net.go:648] Add success.
I0319 18:59:43.422783 543705 net.go:770] primary dev: ETH0
I0319 18:59:43.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0319 18:59:43.422809 543705 net.go:698] Add success.
I0319 18:59:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 18:59:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 18:59:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 18:59:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 18:59:53.409781 543705 memory.go:184] no items to output this cycle
I0319 18:59:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:00:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:03.409775 543705 memory.go:184] no items to output this cycle
I0319 19:00:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 19:00:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:13.409790 543705 memory.go:191] Add success.
I0319 19:00:13.409797 543705 cpu.go:282] Add success.
W0319 19:00:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:00:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:00:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:00:13.420132 543705 net.go:648] Add success.
I0319 19:00:13.423371 543705 net.go:770] primary dev: ETH0
I0319 19:00:13.423386 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:00:13.423401 543705 net.go:698] Add success.
I0319 19:00:13.501373 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6884e0ed-abff-4620-9a57-ee0ddf245f82","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:00:13.501406 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:00:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:00:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:00:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0319 19:00:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:00:14.456742 543705 disk_worker.go:494] system disk:vda1
I0319 19:00:14.456778 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:00:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:00:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:00:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:00:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:00:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:00:20.545994 543705 disk_info.go:125] begin check local disk info of client
I0319 19:00:20.548554 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:00:20.548560 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003426c0 0xc000342700]
E0319 19:00:23.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:23.409906 543705 cpu.go:275] no items to output this cycle
I0319 19:00:23.409909 543705 memory.go:184] no items to output this cycle
E0319 19:00:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:33.409805 543705 memory.go:184] no items to output this cycle
I0319 19:00:33.409820 543705 cpu.go:275] no items to output this cycle
I0319 19:00:37.817888 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:00:37.817894 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:00:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:43.410827 543705 memory.go:191] Add success.
I0319 19:00:43.409830 543705 cpu.go:282] Add success.
I0319 19:00:43.420557 543705 net.go:648] Add success.
I0319 19:00:43.423315 543705 net.go:770] primary dev: ETH0
I0319 19:00:43.423328 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:00:43.423342 543705 net.go:698] Add success.
I0319 19:00:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:00:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:00:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:00:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:00:53.409785 543705 memory.go:184] no items to output this cycle
I0319 19:00:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 19:01:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:03.409803 543705 memory.go:184] no items to output this cycle
I0319 19:01:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 19:01:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:13.409776 543705 memory.go:191] Add success.
W0319 19:01:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 19:01:13.409809 543705 cpu.go:282] Add success.
W0319 19:01:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:01:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:01:13.420035 543705 net.go:648] Add success.
I0319 19:01:13.422991 543705 net.go:770] primary dev: ETH0
I0319 19:01:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:01:13.423017 543705 net.go:698] Add success.
I0319 19:01:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:01:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:01:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 19:01:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:01:14.456533 543705 disk_worker.go:494] system disk:vda1
I0319 19:01:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:01:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:01:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:01:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:01:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:01:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:01:20.549444 543705 disk_info.go:125] begin check local disk info of client
I0319 19:01:20.551862 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:01:20.551868 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e2000 0xc0003e2040]
E0319 19:01:23.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:23.409822 543705 memory.go:184] no items to output this cycle
I0319 19:01:23.409825 543705 cpu.go:275] no items to output this cycle
E0319 19:01:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:33.409765 543705 memory.go:184] no items to output this cycle
I0319 19:01:33.409811 543705 cpu.go:275] no items to output this cycle
E0319 19:01:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:43.409788 543705 memory.go:191] Add success.
I0319 19:01:43.409805 543705 cpu.go:282] Add success.
I0319 19:01:43.419967 543705 net.go:648] Add success.
I0319 19:01:43.422883 543705 net.go:770] primary dev: ETH0
I0319 19:01:43.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:01:43.422909 543705 net.go:698] Add success.
I0319 19:01:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:01:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:01:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:01:53.409774 543705 memory.go:184] no items to output this cycle
I0319 19:01:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 19:02:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:03.409809 543705 memory.go:184] no items to output this cycle
I0319 19:02:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 19:02:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:13.409773 543705 memory.go:191] Add success.
W0319 19:02:13.409799 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 19:02:13.409800 543705 cpu.go:282] Add success.
W0319 19:02:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:02:13.409813 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:02:13.420058 543705 net.go:648] Add success.
I0319 19:02:13.422806 543705 net.go:770] primary dev: ETH0
I0319 19:02:13.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:02:13.422838 543705 net.go:698] Add success.
W0319 19:02:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:02:14.455261 543705 disk_worker.go:708] disk space is not compliant
W0319 19:02:14.455266 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:02:14.455933 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:02:14.455943 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:02:14.455949 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:02:14.456853 543705 disk_worker.go:494] system disk:vda1
I0319 19:02:14.456884 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:02:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:02:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:02:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:02:16.457992 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:02:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:02:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:02:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:02:20.552397 543705 disk_info.go:125] begin check local disk info of client
I0319 19:02:20.554774 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:02:20.554780 543705 disk_info.go:196] parse disk info done, disk is : [0xc000354ec0 0xc000354f00]
E0319 19:02:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:23.409801 543705 memory.go:184] no items to output this cycle
I0319 19:02:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:02:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:33.409808 543705 memory.go:184] no items to output this cycle
I0319 19:02:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 19:02:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:43.409787 543705 memory.go:191] Add success.
I0319 19:02:43.409806 543705 cpu.go:282] Add success.
I0319 19:02:43.419887 543705 net.go:648] Add success.
I0319 19:02:43.422573 543705 net.go:770] primary dev: ETH0
I0319 19:02:43.422587 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:02:43.422599 543705 net.go:698] Add success.
I0319 19:02:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:02:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:02:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:02:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:02:53.409810 543705 memory.go:184] no items to output this cycle
I0319 19:02:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 19:03:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:03.409802 543705 memory.go:184] no items to output this cycle
I0319 19:03:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 19:03:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:13.409782 543705 memory.go:191] Add success.
W0319 19:03:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:03:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:03:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:03:13.409832 543705 cpu.go:282] Add success.
I0319 19:03:13.420039 543705 net.go:648] Add success.
I0319 19:03:13.422524 543705 net.go:770] primary dev: ETH0
I0319 19:03:13.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:03:13.422550 543705 net.go:698] Add success.
I0319 19:03:13.463891 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7c238f65-6425-49f7-9f84-1ad25baba171","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:03:13.463925 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:03:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:03:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:03:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 19:03:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:03:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 19:03:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:03:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:03:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:03:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:03:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:03:16.472513 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:03:20.554862 543705 disk_info.go:125] begin check local disk info of client
I0319 19:03:20.557470 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:03:20.557477 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002903c0 0xc000290400]
I0319 19:03:23.409943 543705 cpu.go:275] no items to output this cycle
E0319 19:03:23.409944 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:23.409971 543705 memory.go:184] no items to output this cycle
E0319 19:03:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:33.409790 543705 memory.go:184] no items to output this cycle
I0319 19:03:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 19:03:37.819790 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:03:37.819797 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:03:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:43.411147 543705 memory.go:191] Add success.
I0319 19:03:43.409814 543705 cpu.go:282] Add success.
I0319 19:03:43.419870 543705 net.go:648] Add success.
I0319 19:03:43.422794 543705 net.go:770] primary dev: ETH0
I0319 19:03:43.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:03:43.422835 543705 net.go:698] Add success.
I0319 19:03:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:03:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:03:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:03:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:03:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 19:03:53.409785 543705 memory.go:184] no items to output this cycle
E0319 19:04:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:03.409791 543705 memory.go:184] no items to output this cycle
I0319 19:04:03.409792 543705 cpu.go:275] no items to output this cycle
W0319 19:04:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:04:13.409728 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:04:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:04:13.409787 543705 cpu.go:282] Add success.
E0319 19:04:13.409831 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:13.409852 543705 memory.go:191] Add success.
I0319 19:04:13.420052 543705 net.go:648] Add success.
I0319 19:04:13.423383 543705 net.go:770] primary dev: ETH0
I0319 19:04:13.423400 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:04:13.423414 543705 net.go:698] Add success.
I0319 19:04:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:04:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:04:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0319 19:04:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:04:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 19:04:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:04:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:04:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:04:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:04:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:04:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:04:20.558482 543705 disk_info.go:125] begin check local disk info of client
I0319 19:04:20.560992 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:04:20.560998 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e7000 0xc0003e7040]
E0319 19:04:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:23.409784 543705 memory.go:184] no items to output this cycle
I0319 19:04:23.409963 543705 cpu.go:275] no items to output this cycle
E0319 19:04:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:33.409793 543705 memory.go:184] no items to output this cycle
I0319 19:04:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:04:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:43.409795 543705 cpu.go:282] Add success.
I0319 19:04:43.409804 543705 memory.go:191] Add success.
I0319 19:04:43.419858 543705 net.go:648] Add success.
I0319 19:04:43.422855 543705 net.go:770] primary dev: ETH0
I0319 19:04:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:04:43.422880 543705 net.go:698] Add success.
I0319 19:04:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:04:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:04:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:04:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:04:53.409784 543705 memory.go:184] no items to output this cycle
I0319 19:04:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 19:05:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:03.409786 543705 memory.go:184] no items to output this cycle
I0319 19:05:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 19:05:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:13.409804 543705 memory.go:191] Add success.
I0319 19:05:13.409818 543705 cpu.go:282] Add success.
W0319 19:05:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:05:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:05:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:05:13.420128 543705 net.go:648] Add success.
I0319 19:05:13.422857 543705 net.go:770] primary dev: ETH0
I0319 19:05:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:05:13.422881 543705 net.go:698] Add success.
I0319 19:05:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:05:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:05:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 19:05:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:05:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 19:05:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:05:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:05:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:05:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:05:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:05:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:05:20.561084 543705 disk_info.go:125] begin check local disk info of client
I0319 19:05:20.563549 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:05:20.563555 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e8f00 0xc0004e8f40]
I0319 19:05:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 19:05:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:23.409810 543705 memory.go:184] no items to output this cycle
E0319 19:05:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:33.409792 543705 memory.go:184] no items to output this cycle
I0319 19:05:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:05:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:43.409803 543705 memory.go:191] Add success.
I0319 19:05:43.409804 543705 cpu.go:282] Add success.
I0319 19:05:43.419890 543705 net.go:648] Add success.
I0319 19:05:43.422719 543705 net.go:770] primary dev: ETH0
I0319 19:05:43.422733 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:05:43.422745 543705 net.go:698] Add success.
I0319 19:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:05:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:05:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:05:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:05:53.409777 543705 memory.go:184] no items to output this cycle
I0319 19:05:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 19:06:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:03.409786 543705 memory.go:184] no items to output this cycle
I0319 19:06:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:06:13.410507 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:13.410537 543705 memory.go:191] Add success.
I0319 19:06:13.410552 543705 cpu.go:282] Add success.
W0319 19:06:13.410565 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:06:13.410578 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:06:13.410582 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:06:13.419655 543705 net.go:770] primary dev: ETH0
I0319 19:06:13.419669 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:06:13.419681 543705 net.go:698] Add success.
I0319 19:06:13.420028 543705 net.go:648] Add success.
I0319 19:06:13.474203 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e327f2aa-5e67-42fd-9a14-c5ecd5092d08","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:06:13.474236 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:06:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:06:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 19:06:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:06:14.456692 543705 disk_worker.go:494] system disk:vda1
I0319 19:06:14.456728 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:06:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:06:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:06:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:06:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:06:20.564524 543705 disk_info.go:125] begin check local disk info of client
I0319 19:06:20.566954 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:06:20.566960 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e8640 0xc0004e8680]
E0319 19:06:23.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:23.409807 543705 cpu.go:275] no items to output this cycle
I0319 19:06:23.409977 543705 memory.go:184] no items to output this cycle
E0319 19:06:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:33.409793 543705 memory.go:184] no items to output this cycle
I0319 19:06:33.409802 543705 cpu.go:275] no items to output this cycle
I0319 19:06:37.821742 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:06:37.821749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:06:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:43.410742 543705 memory.go:191] Add success.
I0319 19:06:43.409800 543705 cpu.go:282] Add success.
I0319 19:06:43.420444 543705 net.go:648] Add success.
I0319 19:06:43.422892 543705 net.go:770] primary dev: ETH0
I0319 19:06:43.422907 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:06:43.422921 543705 net.go:698] Add success.
I0319 19:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:06:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:06:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:06:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:06:53.409785 543705 memory.go:184] no items to output this cycle
I0319 19:06:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 19:07:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:03.409779 543705 cpu.go:275] no items to output this cycle
I0319 19:07:03.409780 543705 memory.go:184] no items to output this cycle
E0319 19:07:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:13.409792 543705 memory.go:191] Add success.
I0319 19:07:13.409795 543705 cpu.go:282] Add success.
W0319 19:07:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:07:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:07:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:07:13.420112 543705 net.go:648] Add success.
I0319 19:07:13.422750 543705 net.go:770] primary dev: ETH0
I0319 19:07:13.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:07:13.422778 543705 net.go:698] Add success.
I0319 19:07:13.453308 543705 event_worker.go:152] Polling the log file for events...
W0319 19:07:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:07:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 19:07:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:07:14.456963 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:07:14.456973 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:07:14.456979 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:07:14.457043 543705 disk_worker.go:494] system disk:vda1
I0319 19:07:14.457074 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:07:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:07:15.456850 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:07:16.458032 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:07:16.458040 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:07:16.458084 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:07:16.458101 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:07:16.472470 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:07:20.567476 543705 disk_info.go:125] begin check local disk info of client
I0319 19:07:20.569903 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:07:20.569909 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475880 0xc0004758c0]
E0319 19:07:23.409955 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:23.410039 543705 memory.go:184] no items to output this cycle
I0319 19:07:23.410053 543705 cpu.go:275] no items to output this cycle
E0319 19:07:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:33.409772 543705 memory.go:184] no items to output this cycle
I0319 19:07:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 19:07:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:43.409795 543705 memory.go:191] Add success.
I0319 19:07:43.409812 543705 cpu.go:282] Add success.
I0319 19:07:43.419769 543705 net.go:770] primary dev: ETH0
I0319 19:07:43.419782 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:07:43.419795 543705 net.go:698] Add success.
I0319 19:07:43.420042 543705 net.go:648] Add success.
I0319 19:07:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:07:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:07:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:07:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:07:53.409793 543705 memory.go:184] no items to output this cycle
I0319 19:07:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 19:08:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:03.409813 543705 memory.go:184] no items to output this cycle
I0319 19:08:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 19:08:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:13.409799 543705 cpu.go:282] Add success.
I0319 19:08:13.409802 543705 memory.go:191] Add success.
W0319 19:08:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:08:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:08:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:08:13.420055 543705 net.go:648] Add success.
I0319 19:08:13.422768 543705 net.go:770] primary dev: ETH0
I0319 19:08:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:08:13.422796 543705 net.go:698] Add success.
I0319 19:08:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:08:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:08:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 19:08:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:08:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 19:08:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:08:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:08:16.458240 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:08:16.458303 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:08:16.458328 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:08:16.472686 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:08:20.569996 543705 disk_info.go:125] begin check local disk info of client
I0319 19:08:20.572468 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:08:20.572475 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029c000 0xc00029c040]
E0319 19:08:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:23.409796 543705 memory.go:184] no items to output this cycle
I0319 19:08:23.409845 543705 cpu.go:275] no items to output this cycle
E0319 19:08:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:33.409796 543705 memory.go:184] no items to output this cycle
I0319 19:08:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 19:08:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:43.409801 543705 memory.go:191] Add success.
I0319 19:08:43.409806 543705 cpu.go:282] Add success.
I0319 19:08:43.420043 543705 net.go:648] Add success.
I0319 19:08:43.422846 543705 net.go:770] primary dev: ETH0
I0319 19:08:43.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:08:43.422873 543705 net.go:698] Add success.
I0319 19:08:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:08:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:08:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:08:53.409821 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:08:53.409843 543705 memory.go:184] no items to output this cycle
I0319 19:08:53.409854 543705 cpu.go:275] no items to output this cycle
E0319 19:09:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:03.409792 543705 memory.go:184] no items to output this cycle
I0319 19:09:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 19:09:13.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:13.409841 543705 memory.go:191] Add success.
I0319 19:09:13.409846 543705 cpu.go:282] Add success.
W0319 19:09:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:09:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:09:13.409893 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:09:13.420164 543705 net.go:648] Add success.
I0319 19:09:13.422804 543705 net.go:770] primary dev: ETH0
I0319 19:09:13.422817 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:09:13.422828 543705 net.go:698] Add success.
I0319 19:09:13.468609 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b47bcb33-be50-4324-bd55-7cb8d15ccbfa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:09:13.468641 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:09:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:09:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:09:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 19:09:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:09:14.456647 543705 disk_worker.go:494] system disk:vda1
I0319 19:09:14.456679 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:09:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:09:16.457579 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:09:16.457662 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:09:16.457689 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:09:16.473055 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:09:20.573549 543705 disk_info.go:125] begin check local disk info of client
I0319 19:09:20.576016 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:09:20.576022 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be080 0xc0002be0c0]
E0319 19:09:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:23.409778 543705 memory.go:184] no items to output this cycle
I0319 19:09:23.409847 543705 cpu.go:275] no items to output this cycle
E0319 19:09:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 19:09:33.409801 543705 memory.go:184] no items to output this cycle
I0319 19:09:37.821887 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:09:37.821893 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:09:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:43.410617 543705 memory.go:191] Add success.
I0319 19:09:43.409810 543705 cpu.go:282] Add success.
I0319 19:09:43.420340 543705 net.go:648] Add success.
I0319 19:09:43.423044 543705 net.go:770] primary dev: ETH0
I0319 19:09:43.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:09:43.423072 543705 net.go:698] Add success.
I0319 19:09:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:09:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:09:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:09:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:09:53.409783 543705 cpu.go:275] no items to output this cycle
I0319 19:09:53.409785 543705 memory.go:184] no items to output this cycle
E0319 19:10:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:03.409819 543705 memory.go:184] no items to output this cycle
I0319 19:10:03.409830 543705 cpu.go:275] no items to output this cycle
E0319 19:10:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:13.409798 543705 memory.go:191] Add success.
I0319 19:10:13.409799 543705 cpu.go:282] Add success.
W0319 19:10:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:10:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:10:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:10:13.420073 543705 net.go:648] Add success.
I0319 19:10:13.423068 543705 net.go:770] primary dev: ETH0
I0319 19:10:13.423080 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:10:13.423092 543705 net.go:698] Add success.
I0319 19:10:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:10:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:10:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 19:10:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:10:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 19:10:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:10:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:10:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:10:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:10:16.472448 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:10:20.576516 543705 disk_info.go:125] begin check local disk info of client
I0319 19:10:20.579003 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:10:20.579009 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a0b80 0xc0002a0bc0]
E0319 19:10:23.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:23.409898 543705 memory.go:184] no items to output this cycle
I0319 19:10:23.410045 543705 cpu.go:275] no items to output this cycle
E0319 19:10:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:33.409792 543705 memory.go:184] no items to output this cycle
I0319 19:10:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 19:10:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:43.409792 543705 memory.go:191] Add success.
I0319 19:10:43.409792 543705 cpu.go:282] Add success.
I0319 19:10:43.420002 543705 net.go:648] Add success.
I0319 19:10:43.423182 543705 net.go:770] primary dev: ETH0
I0319 19:10:43.423197 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:10:43.423211 543705 net.go:698] Add success.
I0319 19:10:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:10:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:10:53.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:10:53.409821 543705 memory.go:184] no items to output this cycle
I0319 19:10:53.409831 543705 cpu.go:275] no items to output this cycle
E0319 19:11:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:03.409807 543705 memory.go:184] no items to output this cycle
I0319 19:11:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 19:11:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:13.409786 543705 memory.go:191] Add success.
I0319 19:11:13.409807 543705 cpu.go:282] Add success.
W0319 19:11:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:11:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:11:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:11:13.420164 543705 net.go:648] Add success.
I0319 19:11:13.423028 543705 net.go:770] primary dev: ETH0
I0319 19:11:13.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:11:13.423053 543705 net.go:698] Add success.
I0319 19:11:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:11:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:11:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 19:11:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:11:14.456633 543705 disk_worker.go:494] system disk:vda1
I0319 19:11:14.456665 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:11:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:11:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:11:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:11:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:11:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:11:20.579092 543705 disk_info.go:125] begin check local disk info of client
I0319 19:11:20.581544 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:11:20.581554 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bf640 0xc0002bf680]
E0319 19:11:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:23.409762 543705 memory.go:184] no items to output this cycle
I0319 19:11:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 19:11:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:33.409806 543705 memory.go:184] no items to output this cycle
I0319 19:11:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 19:11:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:43.409788 543705 memory.go:191] Add success.
I0319 19:11:43.409805 543705 cpu.go:282] Add success.
I0319 19:11:43.420105 543705 net.go:648] Add success.
I0319 19:11:43.422651 543705 net.go:770] primary dev: ETH0
I0319 19:11:43.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:11:43.422681 543705 net.go:698] Add success.
I0319 19:11:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:11:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:11:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:11:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:11:53.409811 543705 memory.go:184] no items to output this cycle
I0319 19:11:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:12:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:03.409782 543705 memory.go:184] no items to output this cycle
I0319 19:12:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:12:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:13.409805 543705 memory.go:191] Add success.
I0319 19:12:13.409813 543705 cpu.go:282] Add success.
W0319 19:12:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:12:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:12:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:12:13.420152 543705 net.go:648] Add success.
I0319 19:12:13.422868 543705 net.go:770] primary dev: ETH0
I0319 19:12:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:12:13.422896 543705 net.go:698] Add success.
I0319 19:12:13.469047 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"192f51d1-c95c-4ba0-8546-a9d0508c6517","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:12:13.469080 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 19:12:14.455230 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:12:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0319 19:12:14.455250 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:12:14.455921 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:12:14.455931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:12:14.455937 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:12:14.456836 543705 disk_worker.go:494] system disk:vda1
I0319 19:12:14.456864 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:12:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:12:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 19:12:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:12:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:12:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:12:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:12:16.472357 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:12:20.582591 543705 disk_info.go:125] begin check local disk info of client
I0319 19:12:20.584932 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:12:20.584937 543705 disk_info.go:196] parse disk info done, disk is : [0xc00055af00 0xc00055af40]
E0319 19:12:23.410236 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:23.410249 543705 cpu.go:275] no items to output this cycle
I0319 19:12:23.410252 543705 memory.go:184] no items to output this cycle
E0319 19:12:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:33.409909 543705 memory.go:184] no items to output this cycle
I0319 19:12:33.409994 543705 cpu.go:275] no items to output this cycle
I0319 19:12:37.822046 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:12:37.822053 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:12:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:43.410742 543705 memory.go:191] Add success.
I0319 19:12:43.409812 543705 cpu.go:282] Add success.
I0319 19:12:43.420439 543705 net.go:648] Add success.
I0319 19:12:43.423366 543705 net.go:770] primary dev: ETH0
I0319 19:12:43.423381 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:12:43.423395 543705 net.go:698] Add success.
I0319 19:12:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:12:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:12:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:12:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:12:53.409781 543705 cpu.go:275] no items to output this cycle
I0319 19:12:53.409794 543705 memory.go:184] no items to output this cycle
E0319 19:13:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:03.409788 543705 cpu.go:275] no items to output this cycle
I0319 19:13:03.409789 543705 memory.go:184] no items to output this cycle
E0319 19:13:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:13.409786 543705 memory.go:191] Add success.
I0319 19:13:13.409787 543705 cpu.go:282] Add success.
W0319 19:13:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:13:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:13:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:13:13.420480 543705 net.go:648] Add success.
I0319 19:13:13.423135 543705 net.go:770] primary dev: ETH0
I0319 19:13:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:13:13.423161 543705 net.go:698] Add success.
I0319 19:13:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:13:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:13:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 19:13:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:13:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 19:13:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:13:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:13:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:13:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:13:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:13:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:13:20.585567 543705 disk_info.go:125] begin check local disk info of client
I0319 19:13:20.588041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:13:20.588048 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003aab80 0xc0003aabc0]
E0319 19:13:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:23.409787 543705 memory.go:184] no items to output this cycle
I0319 19:13:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:13:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:33.409782 543705 memory.go:184] no items to output this cycle
I0319 19:13:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:13:43.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:43.409883 543705 memory.go:191] Add success.
I0319 19:13:43.409928 543705 cpu.go:282] Add success.
I0319 19:13:43.420088 543705 net.go:648] Add success.
I0319 19:13:43.422830 543705 net.go:770] primary dev: ETH0
I0319 19:13:43.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:13:43.422859 543705 net.go:698] Add success.
I0319 19:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:13:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:13:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:13:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:13:53.409770 543705 memory.go:184] no items to output this cycle
I0319 19:13:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:14:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:03.409792 543705 memory.go:184] no items to output this cycle
I0319 19:14:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:14:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:13.409790 543705 cpu.go:282] Add success.
I0319 19:14:13.409797 543705 memory.go:191] Add success.
W0319 19:14:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:14:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:14:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:14:13.420115 543705 net.go:648] Add success.
I0319 19:14:13.422850 543705 net.go:770] primary dev: ETH0
I0319 19:14:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:14:13.422878 543705 net.go:698] Add success.
I0319 19:14:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:14:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:14:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 19:14:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:14:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 19:14:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:14:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:14:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:14:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:14:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:14:20.588613 543705 disk_info.go:125] begin check local disk info of client
I0319 19:14:20.591116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:14:20.591124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ab880 0xc0003ab8c0]
E0319 19:14:23.410256 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:23.410274 543705 memory.go:184] no items to output this cycle
I0319 19:14:23.410287 543705 cpu.go:275] no items to output this cycle
E0319 19:14:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:33.409795 543705 cpu.go:275] no items to output this cycle
I0319 19:14:33.409797 543705 memory.go:184] no items to output this cycle
E0319 19:14:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:43.409782 543705 memory.go:191] Add success.
I0319 19:14:43.409812 543705 cpu.go:282] Add success.
I0319 19:14:43.419939 543705 net.go:648] Add success.
I0319 19:14:43.422821 543705 net.go:770] primary dev: ETH0
I0319 19:14:43.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:14:43.422846 543705 net.go:698] Add success.
I0319 19:14:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:14:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:14:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:14:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:14:53.409773 543705 memory.go:184] no items to output this cycle
I0319 19:14:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 19:15:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:03.409778 543705 memory.go:184] no items to output this cycle
I0319 19:15:03.409779 543705 cpu.go:275] no items to output this cycle
E0319 19:15:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:13.409809 543705 memory.go:191] Add success.
I0319 19:15:13.409817 543705 cpu.go:282] Add success.
W0319 19:15:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:15:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:15:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:15:13.420067 543705 net.go:648] Add success.
I0319 19:15:13.422770 543705 net.go:770] primary dev: ETH0
I0319 19:15:13.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:15:13.422793 543705 net.go:698] Add success.
I0319 19:15:13.469987 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"24a55ab7-a49a-44ee-b44e-8956b1e9f05c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:15:13.470020 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:15:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:15:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:15:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 19:15:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:15:14.456678 543705 disk_worker.go:494] system disk:vda1
I0319 19:15:14.456707 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:15:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:15:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:15:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:15:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:15:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:15:20.591207 543705 disk_info.go:125] begin check local disk info of client
I0319 19:15:20.593700 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:15:20.593707 543705 disk_info.go:196] parse disk info done, disk is : [0xc00055b940 0xc00055b980]
E0319 19:15:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:23.409787 543705 memory.go:184] no items to output this cycle
I0319 19:15:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 19:15:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:33.409883 543705 memory.go:184] no items to output this cycle
I0319 19:15:33.409949 543705 cpu.go:275] no items to output this cycle
I0319 19:15:37.823806 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:15:37.823813 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:15:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:43.410620 543705 memory.go:191] Add success.
I0319 19:15:43.409804 543705 cpu.go:282] Add success.
I0319 19:15:43.420132 543705 net.go:770] primary dev: ETH0
I0319 19:15:43.420145 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:15:43.420157 543705 net.go:698] Add success.
I0319 19:15:43.420518 543705 net.go:648] Add success.
I0319 19:15:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:15:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:15:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:15:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:15:53.409800 543705 memory.go:184] no items to output this cycle
I0319 19:15:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 19:16:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:03.409782 543705 cpu.go:275] no items to output this cycle
I0319 19:16:03.409785 543705 memory.go:184] no items to output this cycle
E0319 19:16:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:13.409814 543705 memory.go:191] Add success.
I0319 19:16:13.409815 543705 cpu.go:282] Add success.
W0319 19:16:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:16:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:16:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:16:13.420172 543705 net.go:648] Add success.
I0319 19:16:13.423519 543705 net.go:770] primary dev: ETH0
I0319 19:16:13.423534 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:16:13.423548 543705 net.go:698] Add success.
I0319 19:16:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:16:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:16:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 19:16:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:16:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 19:16:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:16:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:16:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:16:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:16:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:16:16.472422 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:16:20.593789 543705 disk_info.go:125] begin check local disk info of client
I0319 19:16:20.596243 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:16:20.596249 543705 disk_info.go:196] parse disk info done, disk is : [0xc000260340 0xc000260380]
E0319 19:16:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:23.409781 543705 cpu.go:275] no items to output this cycle
I0319 19:16:23.409787 543705 memory.go:184] no items to output this cycle
E0319 19:16:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:33.409804 543705 memory.go:184] no items to output this cycle
I0319 19:16:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 19:16:43.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:43.409913 543705 memory.go:191] Add success.
I0319 19:16:43.410020 543705 cpu.go:282] Add success.
I0319 19:16:43.419748 543705 net.go:648] Add success.
I0319 19:16:43.422578 543705 net.go:770] primary dev: ETH0
I0319 19:16:43.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:16:43.422602 543705 net.go:698] Add success.
I0319 19:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:16:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:16:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:16:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:16:53.409810 543705 memory.go:184] no items to output this cycle
I0319 19:16:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 19:17:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:03.409776 543705 memory.go:184] no items to output this cycle
I0319 19:17:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 19:17:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:13.409786 543705 memory.go:191] Add success.
I0319 19:17:13.409791 543705 cpu.go:282] Add success.
W0319 19:17:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:17:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:17:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:17:13.420127 543705 net.go:648] Add success.
I0319 19:17:13.422798 543705 net.go:770] primary dev: ETH0
I0319 19:17:13.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:17:13.422823 543705 net.go:698] Add success.
I0319 19:17:13.453358 543705 event_worker.go:152] Polling the log file for events...
W0319 19:17:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:17:14.455139 543705 disk_worker.go:708] disk space is not compliant
W0319 19:17:14.455142 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:17:14.456868 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:17:14.456877 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:17:14.456884 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:17:14.456956 543705 disk_worker.go:494] system disk:vda1
I0319 19:17:14.456997 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:17:15.456852 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:17:15.456861 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:17:16.457921 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:17:16.457921 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:17:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:17:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:17:16.472326 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:17:20.596619 543705 disk_info.go:125] begin check local disk info of client
I0319 19:17:20.599047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:17:20.599053 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f5c0 0xc00029f600]
E0319 19:17:23.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:23.409819 543705 memory.go:184] no items to output this cycle
I0319 19:17:23.409832 543705 cpu.go:275] no items to output this cycle
E0319 19:17:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:33.409785 543705 memory.go:184] no items to output this cycle
I0319 19:17:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 19:17:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:43.409794 543705 memory.go:191] Add success.
I0319 19:17:43.409810 543705 cpu.go:282] Add success.
I0319 19:17:43.419900 543705 net.go:648] Add success.
I0319 19:17:43.422517 543705 net.go:770] primary dev: ETH0
I0319 19:17:43.422529 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:17:43.422542 543705 net.go:698] Add success.
I0319 19:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:17:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:17:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:17:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:17:53.409777 543705 memory.go:184] no items to output this cycle
I0319 19:17:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 19:18:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:03.409811 543705 memory.go:184] no items to output this cycle
I0319 19:18:03.409829 543705 cpu.go:275] no items to output this cycle
E0319 19:18:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:13.409808 543705 memory.go:191] Add success.
I0319 19:18:13.409815 543705 cpu.go:282] Add success.
W0319 19:18:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:18:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:18:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:18:13.420117 543705 net.go:648] Add success.
I0319 19:18:13.422807 543705 net.go:770] primary dev: ETH0
I0319 19:18:13.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:18:13.422834 543705 net.go:698] Add success.
I0319 19:18:13.469252 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8c9014a2-1bc3-4abb-bf36-4896aecedc30","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:18:13.469286 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:18:14.453933 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:18:14.455226 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:18:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0319 19:18:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:18:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 19:18:14.456661 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:18:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:18:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:18:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:18:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:18:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:18:20.599640 543705 disk_info.go:125] begin check local disk info of client
I0319 19:18:20.602140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:18:20.602145 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f000 0xc00029f040]
E0319 19:18:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:23.409795 543705 memory.go:184] no items to output this cycle
I0319 19:18:23.409810 543705 cpu.go:275] no items to output this cycle
I0319 19:18:33.409914 543705 cpu.go:275] no items to output this cycle
E0319 19:18:33.409962 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:33.409974 543705 memory.go:184] no items to output this cycle
I0319 19:18:37.825732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:18:37.825738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:18:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:43.410757 543705 memory.go:191] Add success.
I0319 19:18:43.409803 543705 cpu.go:282] Add success.
I0319 19:18:43.420540 543705 net.go:648] Add success.
I0319 19:18:43.423278 543705 net.go:770] primary dev: ETH0
I0319 19:18:43.423290 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:18:43.423303 543705 net.go:698] Add success.
I0319 19:18:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:18:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:18:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:18:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:18:53.409772 543705 memory.go:184] no items to output this cycle
I0319 19:18:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:19:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:03.409781 543705 memory.go:184] no items to output this cycle
I0319 19:19:03.409782 543705 cpu.go:275] no items to output this cycle
E0319 19:19:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:13.409815 543705 memory.go:191] Add success.
I0319 19:19:13.409827 543705 cpu.go:282] Add success.
W0319 19:19:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:19:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:19:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:19:13.420137 543705 net.go:648] Add success.
I0319 19:19:13.422839 543705 net.go:770] primary dev: ETH0
I0319 19:19:13.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:19:13.422873 543705 net.go:698] Add success.
I0319 19:19:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:19:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:19:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 19:19:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:19:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 19:19:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:19:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:19:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:19:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:19:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:19:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:19:20.602225 543705 disk_info.go:125] begin check local disk info of client
I0319 19:19:20.604688 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:19:20.604694 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024ad80 0xc00024adc0]
I0319 19:19:23.409776 543705 cpu.go:275] no items to output this cycle
E0319 19:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:23.409793 543705 memory.go:184] no items to output this cycle
E0319 19:19:33.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:33.409887 543705 memory.go:184] no items to output this cycle
I0319 19:19:33.409948 543705 cpu.go:275] no items to output this cycle
E0319 19:19:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:43.409825 543705 memory.go:191] Add success.
I0319 19:19:43.409838 543705 cpu.go:282] Add success.
I0319 19:19:43.419782 543705 net.go:770] primary dev: ETH0
I0319 19:19:43.419797 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:19:43.419812 543705 net.go:698] Add success.
I0319 19:19:43.420166 543705 net.go:648] Add success.
I0319 19:19:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:19:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:19:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:19:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:19:53.409785 543705 memory.go:184] no items to output this cycle
I0319 19:19:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 19:20:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:03.409780 543705 memory.go:184] no items to output this cycle
I0319 19:20:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:20:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:13.409819 543705 memory.go:191] Add success.
I0319 19:20:13.409832 543705 cpu.go:282] Add success.
W0319 19:20:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:20:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:20:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:20:13.420133 543705 net.go:648] Add success.
I0319 19:20:13.422824 543705 net.go:770] primary dev: ETH0
I0319 19:20:13.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:20:13.422851 543705 net.go:698] Add success.
I0319 19:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:20:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:20:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 19:20:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:20:14.456572 543705 disk_worker.go:494] system disk:vda1
I0319 19:20:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:20:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:20:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:20:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:20:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:20:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:20:20.605671 543705 disk_info.go:125] begin check local disk info of client
I0319 19:20:20.608097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:20:20.608104 543705 disk_info.go:196] parse disk info done, disk is : [0xc000230700 0xc000230740]
E0319 19:20:23.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:23.409821 543705 memory.go:184] no items to output this cycle
I0319 19:20:23.409829 543705 cpu.go:275] no items to output this cycle
E0319 19:20:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:33.409797 543705 memory.go:184] no items to output this cycle
I0319 19:20:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:20:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:43.409800 543705 memory.go:191] Add success.
I0319 19:20:43.409827 543705 cpu.go:282] Add success.
I0319 19:20:43.419951 543705 net.go:648] Add success.
I0319 19:20:43.423139 543705 net.go:770] primary dev: ETH0
I0319 19:20:43.423153 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:20:43.423165 543705 net.go:698] Add success.
I0319 19:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:20:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:20:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:20:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:20:53.409784 543705 memory.go:184] no items to output this cycle
I0319 19:20:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 19:21:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:03.409805 543705 memory.go:184] no items to output this cycle
I0319 19:21:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 19:21:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:13.409825 543705 memory.go:191] Add success.
I0319 19:21:13.409826 543705 cpu.go:282] Add success.
W0319 19:21:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:21:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:21:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:21:13.420144 543705 net.go:648] Add success.
I0319 19:21:13.422931 543705 net.go:770] primary dev: ETH0
I0319 19:21:13.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:21:13.422956 543705 net.go:698] Add success.
I0319 19:21:13.464792 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5ee7e12a-779f-4bae-9ad4-33dca58a1862","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:21:13.464828 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:21:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:21:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:21:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0319 19:21:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:21:14.456483 543705 disk_worker.go:494] system disk:vda1
I0319 19:21:14.456514 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:21:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:21:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:21:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:21:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:21:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:21:20.608673 543705 disk_info.go:125] begin check local disk info of client
I0319 19:21:20.611147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:21:20.611154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f7f00 0xc0001f7f40]
E0319 19:21:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:23.409779 543705 memory.go:184] no items to output this cycle
I0319 19:21:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:21:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:33.409782 543705 memory.go:184] no items to output this cycle
I0319 19:21:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 19:21:37.827850 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:21:37.827856 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:21:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:43.410719 543705 memory.go:191] Add success.
I0319 19:21:43.409812 543705 cpu.go:282] Add success.
I0319 19:21:43.420412 543705 net.go:648] Add success.
I0319 19:21:43.423093 543705 net.go:770] primary dev: ETH0
I0319 19:21:43.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:21:43.423123 543705 net.go:698] Add success.
I0319 19:21:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:21:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:21:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:21:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:21:53.409799 543705 memory.go:184] no items to output this cycle
I0319 19:21:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 19:22:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:03.409769 543705 memory.go:184] no items to output this cycle
I0319 19:22:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 19:22:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:13.409811 543705 memory.go:191] Add success.
I0319 19:22:13.409822 543705 cpu.go:282] Add success.
W0319 19:22:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:22:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:22:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:22:13.420672 543705 net.go:648] Add success.
I0319 19:22:13.423681 543705 net.go:770] primary dev: ETH0
I0319 19:22:13.423695 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:22:13.423710 543705 net.go:698] Add success.
W0319 19:22:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:22:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 19:22:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:22:14.456942 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:22:14.456951 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:22:14.456957 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:22:14.457016 543705 disk_worker.go:494] system disk:vda1
I0319 19:22:14.457047 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:22:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:22:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:22:16.457917 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:22:16.457917 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:22:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:22:16.457993 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:22:16.472316 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:22:20.611238 543705 disk_info.go:125] begin check local disk info of client
I0319 19:22:20.613707 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:22:20.613715 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005786c0 0xc000578700]
E0319 19:22:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:23.409770 543705 memory.go:184] no items to output this cycle
I0319 19:22:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 19:22:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:33.409789 543705 memory.go:184] no items to output this cycle
I0319 19:22:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 19:22:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:43.409786 543705 memory.go:191] Add success.
I0319 19:22:43.409812 543705 cpu.go:282] Add success.
I0319 19:22:43.419970 543705 net.go:648] Add success.
I0319 19:22:43.422575 543705 net.go:770] primary dev: ETH0
I0319 19:22:43.422588 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:22:43.422600 543705 net.go:698] Add success.
I0319 19:22:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:22:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:22:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:22:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:22:53.409810 543705 memory.go:184] no items to output this cycle
I0319 19:22:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 19:23:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:03.409804 543705 memory.go:184] no items to output this cycle
I0319 19:23:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 19:23:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:13.409781 543705 memory.go:191] Add success.
I0319 19:23:13.409804 543705 cpu.go:282] Add success.
W0319 19:23:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:23:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:23:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:23:13.420136 543705 net.go:648] Add success.
I0319 19:23:13.422996 543705 net.go:770] primary dev: ETH0
I0319 19:23:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:23:13.423021 543705 net.go:698] Add success.
I0319 19:23:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:23:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:23:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 19:23:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:23:14.456507 543705 disk_worker.go:494] system disk:vda1
I0319 19:23:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:23:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:23:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:23:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:23:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:23:16.472459 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:23:20.613801 543705 disk_info.go:125] begin check local disk info of client
I0319 19:23:20.616272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:23:20.616279 543705 disk_info.go:196] parse disk info done, disk is : [0xc000271180 0xc0002711c0]
E0319 19:23:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:23.409798 543705 memory.go:184] no items to output this cycle
I0319 19:23:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:23:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:33.409773 543705 memory.go:184] no items to output this cycle
I0319 19:23:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 19:23:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:43.409799 543705 memory.go:191] Add success.
I0319 19:23:43.409814 543705 cpu.go:282] Add success.
I0319 19:23:43.420084 543705 net.go:648] Add success.
I0319 19:23:43.422662 543705 net.go:770] primary dev: ETH0
I0319 19:23:43.422677 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:23:43.422688 543705 net.go:698] Add success.
I0319 19:23:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:23:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:23:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:23:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:23:53.409809 543705 memory.go:184] no items to output this cycle
I0319 19:23:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 19:24:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:03.409782 543705 memory.go:184] no items to output this cycle
I0319 19:24:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 19:24:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:13.409788 543705 memory.go:191] Add success.
I0319 19:24:13.409796 543705 cpu.go:282] Add success.
W0319 19:24:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:24:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:24:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:24:13.420070 543705 net.go:648] Add success.
I0319 19:24:13.422899 543705 net.go:770] primary dev: ETH0
I0319 19:24:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:24:13.422924 543705 net.go:698] Add success.
I0319 19:24:13.468266 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"88c117f4-2869-4bc9-a261-88e5296e8666","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:24:13.468301 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:24:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:24:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:24:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 19:24:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:24:14.456544 543705 disk_worker.go:494] system disk:vda1
I0319 19:24:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:24:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:24:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:24:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:24:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:24:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:24:20.616725 543705 disk_info.go:125] begin check local disk info of client
I0319 19:24:20.619335 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:24:20.619341 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6f80 0xc0003b6fc0]
E0319 19:24:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:23.409767 543705 memory.go:184] no items to output this cycle
I0319 19:24:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 19:24:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:33.409780 543705 memory.go:184] no items to output this cycle
I0319 19:24:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 19:24:37.829740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:24:37.829747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:24:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:43.410668 543705 memory.go:191] Add success.
I0319 19:24:43.409809 543705 cpu.go:282] Add success.
I0319 19:24:43.420382 543705 net.go:648] Add success.
I0319 19:24:43.423047 543705 net.go:770] primary dev: ETH0
I0319 19:24:43.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:24:43.423084 543705 net.go:698] Add success.
I0319 19:24:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:24:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:24:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:24:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:24:53.409787 543705 memory.go:184] no items to output this cycle
I0319 19:24:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 19:25:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:03.409797 543705 memory.go:184] no items to output this cycle
I0319 19:25:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 19:25:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:13.409795 543705 memory.go:191] Add success.
I0319 19:25:13.409798 543705 cpu.go:282] Add success.
W0319 19:25:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:25:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:25:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:25:13.420147 543705 net.go:648] Add success.
I0319 19:25:13.422902 543705 net.go:770] primary dev: ETH0
I0319 19:25:13.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:25:13.422927 543705 net.go:698] Add success.
I0319 19:25:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:25:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:25:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0319 19:25:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:25:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 19:25:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:25:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:25:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:25:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:25:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:25:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:25:20.619735 543705 disk_info.go:125] begin check local disk info of client
I0319 19:25:20.622221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:25:20.622228 543705 disk_info.go:196] parse disk info done, disk is : [0xc000250100 0xc000250140]
E0319 19:25:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:23.409803 543705 memory.go:184] no items to output this cycle
I0319 19:25:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 19:25:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:33.409777 543705 memory.go:184] no items to output this cycle
I0319 19:25:33.409899 543705 cpu.go:275] no items to output this cycle
E0319 19:25:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:43.409801 543705 memory.go:191] Add success.
I0319 19:25:43.409801 543705 cpu.go:282] Add success.
I0319 19:25:43.419984 543705 net.go:648] Add success.
I0319 19:25:43.422655 543705 net.go:770] primary dev: ETH0
I0319 19:25:43.422670 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:25:43.422683 543705 net.go:698] Add success.
I0319 19:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:25:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:25:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:25:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:25:53.409764 543705 memory.go:184] no items to output this cycle
I0319 19:25:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:26:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:03.409779 543705 memory.go:184] no items to output this cycle
I0319 19:26:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 19:26:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:13.409816 543705 memory.go:191] Add success.
I0319 19:26:13.409828 543705 cpu.go:282] Add success.
W0319 19:26:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:26:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:26:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:26:13.420167 543705 net.go:648] Add success.
I0319 19:26:13.422888 543705 net.go:770] primary dev: ETH0
I0319 19:26:13.422900 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:26:13.422912 543705 net.go:698] Add success.
I0319 19:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:26:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:26:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 19:26:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:26:14.456507 543705 disk_worker.go:494] system disk:vda1
I0319 19:26:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:26:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:26:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:26:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:26:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:26:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:26:20.622754 543705 disk_info.go:125] begin check local disk info of client
I0319 19:26:20.625265 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:26:20.625271 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024fc40 0xc00024fc80]
E0319 19:26:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:23.409791 543705 memory.go:184] no items to output this cycle
I0319 19:26:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 19:26:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:33.409808 543705 memory.go:184] no items to output this cycle
I0319 19:26:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:26:43.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:43.409901 543705 memory.go:191] Add success.
I0319 19:26:43.409947 543705 cpu.go:282] Add success.
I0319 19:26:43.419726 543705 net.go:648] Add success.
I0319 19:26:43.422558 543705 net.go:770] primary dev: ETH0
I0319 19:26:43.422573 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:26:43.422587 543705 net.go:698] Add success.
I0319 19:26:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:26:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:26:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:26:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:26:53.409790 543705 memory.go:184] no items to output this cycle
I0319 19:26:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 19:27:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:03.409803 543705 memory.go:184] no items to output this cycle
I0319 19:27:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:27:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:13.409795 543705 memory.go:191] Add success.
I0319 19:27:13.409807 543705 cpu.go:282] Add success.
W0319 19:27:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:27:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:27:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:27:13.419947 543705 net.go:770] primary dev: ETH0
I0319 19:27:13.419960 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:27:13.419972 543705 net.go:698] Add success.
I0319 19:27:13.420359 543705 net.go:648] Add success.
I0319 19:27:13.429808 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 19:27:13.452979 543705 event_worker.go:152] Polling the log file for events...
I0319 19:27:13.468636 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"910bfc41-758e-46d5-aa69-c65da3b2cb59","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:27:13.468671 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 19:27:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:27:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 19:27:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:27:14.456111 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:27:14.456120 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:27:14.456126 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:27:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 19:27:14.456573 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:27:15.456562 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:27:15.456576 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:27:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:27:16.458005 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:27:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:27:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:27:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:27:20.625673 543705 disk_info.go:125] begin check local disk info of client
I0319 19:27:20.628175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:27:20.628182 543705 disk_info.go:196] parse disk info done, disk is : [0xc000292d80 0xc000292dc0]
E0319 19:27:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:23.409780 543705 memory.go:184] no items to output this cycle
I0319 19:27:23.409781 543705 cpu.go:275] no items to output this cycle
E0319 19:27:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:33.409777 543705 memory.go:184] no items to output this cycle
I0319 19:27:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 19:27:37.829889 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:27:37.829896 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
I0319 19:27:43.409918 543705 cpu.go:282] Add success.
E0319 19:27:43.409940 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:43.410761 543705 memory.go:191] Add success.
I0319 19:27:43.419717 543705 net.go:648] Add success.
I0319 19:27:43.422563 543705 net.go:770] primary dev: ETH0
I0319 19:27:43.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:27:43.422604 543705 net.go:698] Add success.
I0319 19:27:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:27:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:27:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:27:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:27:53.409767 543705 memory.go:184] no items to output this cycle
I0319 19:27:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 19:28:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:03.409780 543705 memory.go:184] no items to output this cycle
I0319 19:28:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:28:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:13.409811 543705 memory.go:191] Add success.
I0319 19:28:13.409817 543705 cpu.go:282] Add success.
W0319 19:28:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:28:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:28:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:28:13.420067 543705 net.go:648] Add success.
I0319 19:28:13.422976 543705 net.go:770] primary dev: ETH0
I0319 19:28:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:28:13.423001 543705 net.go:698] Add success.
I0319 19:28:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:28:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:28:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 19:28:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:28:14.456632 543705 disk_worker.go:494] system disk:vda1
I0319 19:28:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:28:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:28:16.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:28:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:28:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:28:16.472514 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:28:20.628265 543705 disk_info.go:125] begin check local disk info of client
I0319 19:28:20.630869 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:28:20.630876 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001cc380 0xc0001cc3c0]
E0319 19:28:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:23.409775 543705 cpu.go:275] no items to output this cycle
I0319 19:28:23.409787 543705 memory.go:184] no items to output this cycle
E0319 19:28:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:33.409808 543705 memory.go:184] no items to output this cycle
I0319 19:28:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 19:28:43.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:43.409904 543705 memory.go:191] Add success.
I0319 19:28:43.409962 543705 cpu.go:282] Add success.
I0319 19:28:43.419713 543705 net.go:648] Add success.
I0319 19:28:43.422528 543705 net.go:770] primary dev: ETH0
I0319 19:28:43.422541 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:28:43.422552 543705 net.go:698] Add success.
I0319 19:28:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:28:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:28:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:28:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:28:53.409814 543705 memory.go:184] no items to output this cycle
I0319 19:28:53.409820 543705 cpu.go:275] no items to output this cycle
E0319 19:29:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:03.409777 543705 memory.go:184] no items to output this cycle
I0319 19:29:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:29:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:13.409814 543705 memory.go:191] Add success.
I0319 19:29:13.409818 543705 cpu.go:282] Add success.
W0319 19:29:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:29:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:29:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:29:13.420058 543705 net.go:648] Add success.
I0319 19:29:13.423160 543705 net.go:770] primary dev: ETH0
I0319 19:29:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:29:13.423187 543705 net.go:698] Add success.
I0319 19:29:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:29:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:29:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 19:29:14.455155 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:29:14.456505 543705 disk_worker.go:494] system disk:vda1
I0319 19:29:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:29:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:29:16.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:29:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:29:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:29:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:29:20.631794 543705 disk_info.go:125] begin check local disk info of client
I0319 19:29:20.634225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:29:20.634231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f8c00 0xc0001f8c40]
E0319 19:29:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:23.409793 543705 memory.go:184] no items to output this cycle
I0319 19:29:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:29:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:33.409780 543705 memory.go:184] no items to output this cycle
I0319 19:29:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 19:29:43.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:43.409897 543705 memory.go:191] Add success.
I0319 19:29:43.409990 543705 cpu.go:282] Add success.
I0319 19:29:43.419551 543705 net.go:770] primary dev: ETH0
I0319 19:29:43.419564 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:29:43.419576 543705 net.go:698] Add success.
I0319 19:29:43.419888 543705 net.go:648] Add success.
I0319 19:29:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:29:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:29:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:29:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:29:53.409772 543705 memory.go:184] no items to output this cycle
I0319 19:29:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 19:30:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:03.409777 543705 memory.go:184] no items to output this cycle
I0319 19:30:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:30:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:13.409820 543705 memory.go:191] Add success.
I0319 19:30:13.409824 543705 cpu.go:282] Add success.
W0319 19:30:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:30:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:30:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:30:13.420145 543705 net.go:648] Add success.
I0319 19:30:13.422874 543705 net.go:770] primary dev: ETH0
I0319 19:30:13.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:30:13.422899 543705 net.go:698] Add success.
I0319 19:30:13.469150 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8ce4faa5-b1a3-4778-9f74-223254278640","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:30:13.469194 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:30:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:30:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:30:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 19:30:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:30:14.456690 543705 disk_worker.go:494] system disk:vda1
I0319 19:30:14.456727 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:30:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:30:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:30:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:30:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:30:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:30:20.634819 543705 disk_info.go:125] begin check local disk info of client
I0319 19:30:20.637230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:30:20.637236 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6d80 0xc0003b6dc0]
E0319 19:30:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:23.409797 543705 memory.go:184] no items to output this cycle
I0319 19:30:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 19:30:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:33.409803 543705 memory.go:184] no items to output this cycle
I0319 19:30:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 19:30:37.830031 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:30:37.830038 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:30:43.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:43.410761 543705 memory.go:191] Add success.
I0319 19:30:43.409897 543705 cpu.go:282] Add success.
I0319 19:30:43.419744 543705 net.go:648] Add success.
I0319 19:30:43.422350 543705 net.go:770] primary dev: ETH0
I0319 19:30:43.422365 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:30:43.422379 543705 net.go:698] Add success.
I0319 19:30:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:30:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:30:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:30:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:30:53.409774 543705 memory.go:184] no items to output this cycle
I0319 19:30:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:31:03.410530 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:03.410552 543705 memory.go:184] no items to output this cycle
I0319 19:31:03.410575 543705 cpu.go:275] no items to output this cycle
E0319 19:31:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:13.409795 543705 memory.go:191] Add success.
I0319 19:31:13.409794 543705 cpu.go:282] Add success.
W0319 19:31:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:31:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:31:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:31:13.420432 543705 net.go:648] Add success.
I0319 19:31:13.423158 543705 net.go:770] primary dev: ETH0
I0319 19:31:13.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:31:13.423185 543705 net.go:698] Add success.
I0319 19:31:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:31:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:31:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 19:31:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:31:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 19:31:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:31:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:31:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:31:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:31:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:31:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:31:20.637670 543705 disk_info.go:125] begin check local disk info of client
I0319 19:31:20.640183 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:31:20.640190 543705 disk_info.go:196] parse disk info done, disk is : [0xc000463f00 0xc000463f40]
E0319 19:31:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:23.409768 543705 memory.go:184] no items to output this cycle
I0319 19:31:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 19:31:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:33.409776 543705 memory.go:184] no items to output this cycle
I0319 19:31:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:31:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:43.409795 543705 memory.go:191] Add success.
I0319 19:31:43.409798 543705 cpu.go:282] Add success.
I0319 19:31:43.420268 543705 net.go:648] Add success.
I0319 19:31:43.422903 543705 net.go:770] primary dev: ETH0
I0319 19:31:43.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:31:43.422931 543705 net.go:698] Add success.
I0319 19:31:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:31:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:31:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:31:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:31:53.409770 543705 memory.go:184] no items to output this cycle
I0319 19:31:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 19:32:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:03.409807 543705 memory.go:184] no items to output this cycle
I0319 19:32:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:32:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:13.409820 543705 memory.go:191] Add success.
I0319 19:32:13.409830 543705 cpu.go:282] Add success.
W0319 19:32:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:32:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:32:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:32:13.420145 543705 net.go:648] Add success.
I0319 19:32:13.422725 543705 net.go:770] primary dev: ETH0
I0319 19:32:13.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:32:13.422750 543705 net.go:698] Add success.
W0319 19:32:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:32:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 19:32:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:32:14.456831 543705 disk_worker.go:494] system disk:vda1
I0319 19:32:14.456872 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:32:14.457132 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:32:14.457140 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:32:14.457144 543705 custom_config.go:64] query custom config with name: gpu
E0319 19:32:15.456896 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:32:15.456905 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:32:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:32:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:32:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:32:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:32:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:32:20.640836 543705 disk_info.go:125] begin check local disk info of client
I0319 19:32:20.643278 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:32:20.643284 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466580 0xc0004665c0]
E0319 19:32:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:23.409802 543705 memory.go:184] no items to output this cycle
I0319 19:32:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 19:32:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:33.409790 543705 memory.go:184] no items to output this cycle
I0319 19:32:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:32:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:43.409819 543705 memory.go:191] Add success.
I0319 19:32:43.409828 543705 cpu.go:282] Add success.
I0319 19:32:43.419963 543705 net.go:648] Add success.
I0319 19:32:43.423304 543705 net.go:770] primary dev: ETH0
I0319 19:32:43.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:32:43.423329 543705 net.go:698] Add success.
I0319 19:32:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:32:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:32:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:32:53.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:32:53.409817 543705 memory.go:184] no items to output this cycle
I0319 19:32:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 19:33:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:03.409810 543705 memory.go:184] no items to output this cycle
I0319 19:33:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:33:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:13.409809 543705 memory.go:191] Add success.
I0319 19:33:13.409809 543705 cpu.go:282] Add success.
W0319 19:33:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:33:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:33:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:33:13.420089 543705 net.go:648] Add success.
I0319 19:33:13.422974 543705 net.go:770] primary dev: ETH0
I0319 19:33:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:33:13.423001 543705 net.go:698] Add success.
I0319 19:33:13.464204 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bb616a8a-d6ad-48c1-8e3b-c9000694fe99","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:33:13.464237 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:33:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:33:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:33:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 19:33:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:33:14.456624 543705 disk_worker.go:494] system disk:vda1
I0319 19:33:14.456654 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:33:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:33:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:33:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:33:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:33:16.472504 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:33:20.643368 543705 disk_info.go:125] begin check local disk info of client
I0319 19:33:20.645941 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:33:20.645949 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464440 0xc000464480]
E0319 19:33:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:23.409780 543705 memory.go:184] no items to output this cycle
I0319 19:33:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 19:33:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:33.409798 543705 memory.go:184] no items to output this cycle
I0319 19:33:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 19:33:37.831874 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:33:37.831882 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:33:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:43.410846 543705 memory.go:191] Add success.
I0319 19:33:43.409816 543705 cpu.go:282] Add success.
I0319 19:33:43.420517 543705 net.go:648] Add success.
I0319 19:33:43.423693 543705 net.go:770] primary dev: ETH0
I0319 19:33:43.423704 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:33:43.423722 543705 net.go:698] Add success.
I0319 19:33:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:33:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:33:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:33:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:33:53.409776 543705 memory.go:184] no items to output this cycle
I0319 19:33:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:34:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:03.409782 543705 memory.go:184] no items to output this cycle
I0319 19:34:03.409785 543705 cpu.go:275] no items to output this cycle
W0319 19:34:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:34:13.409727 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:34:13.409733 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:34:13.409797 543705 cpu.go:282] Add success.
E0319 19:34:13.409832 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:13.409856 543705 memory.go:191] Add success.
I0319 19:34:13.420222 543705 net.go:648] Add success.
I0319 19:34:13.423230 543705 net.go:770] primary dev: ETH0
I0319 19:34:13.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:34:13.423256 543705 net.go:698] Add success.
I0319 19:34:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:34:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:34:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 19:34:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:34:14.456522 543705 disk_worker.go:494] system disk:vda1
I0319 19:34:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:34:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:34:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:34:20.646032 543705 disk_info.go:125] begin check local disk info of client
I0319 19:34:20.648483 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:34:20.648490 543705 disk_info.go:196] parse disk info done, disk is : [0xc000465900 0xc000465940]
E0319 19:34:23.409740 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:23.409753 543705 memory.go:184] no items to output this cycle
I0319 19:34:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:34:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:33.409771 543705 memory.go:184] no items to output this cycle
I0319 19:34:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:34:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:43.409806 543705 memory.go:191] Add success.
I0319 19:34:43.409817 543705 cpu.go:282] Add success.
I0319 19:34:43.420035 543705 net.go:648] Add success.
I0319 19:34:43.422583 543705 net.go:770] primary dev: ETH0
I0319 19:34:43.422596 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:34:43.422608 543705 net.go:698] Add success.
I0319 19:34:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:34:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:34:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:34:53.409778 543705 memory.go:184] no items to output this cycle
I0319 19:34:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:35:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:03.409778 543705 memory.go:184] no items to output this cycle
I0319 19:35:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:35:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:13.409813 543705 memory.go:191] Add success.
I0319 19:35:13.409819 543705 cpu.go:282] Add success.
W0319 19:35:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:35:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:35:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:35:13.420216 543705 net.go:648] Add success.
I0319 19:35:13.422970 543705 net.go:770] primary dev: ETH0
I0319 19:35:13.422985 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:35:13.422997 543705 net.go:698] Add success.
I0319 19:35:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:35:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:35:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 19:35:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:35:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 19:35:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:35:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:35:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:35:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:35:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:35:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:35:20.648878 543705 disk_info.go:125] begin check local disk info of client
I0319 19:35:20.651318 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:35:20.651324 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7880 0xc0003b78c0]
E0319 19:35:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:23.409794 543705 memory.go:184] no items to output this cycle
I0319 19:35:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 19:35:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:33.409786 543705 memory.go:184] no items to output this cycle
I0319 19:35:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:35:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:43.409797 543705 memory.go:191] Add success.
I0319 19:35:43.409798 543705 cpu.go:282] Add success.
I0319 19:35:43.419844 543705 net.go:648] Add success.
I0319 19:35:43.423081 543705 net.go:770] primary dev: ETH0
I0319 19:35:43.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:35:43.423107 543705 net.go:698] Add success.
I0319 19:35:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:35:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:35:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:35:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:35:53.409807 543705 memory.go:184] no items to output this cycle
I0319 19:35:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 19:36:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:03.409809 543705 memory.go:184] no items to output this cycle
I0319 19:36:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 19:36:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:13.409881 543705 memory.go:191] Add success.
W0319 19:36:13.409919 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:36:13.409933 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:36:13.409936 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:36:13.409947 543705 cpu.go:282] Add success.
I0319 19:36:13.419735 543705 net.go:648] Add success.
I0319 19:36:13.422320 543705 net.go:770] primary dev: ETH0
I0319 19:36:13.422334 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:36:13.422347 543705 net.go:698] Add success.
I0319 19:36:13.463416 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8764d19b-7e44-409d-bf8c-cc1e5782048e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:36:13.463446 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:36:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:36:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:36:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 19:36:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:36:14.456717 543705 disk_worker.go:494] system disk:vda1
I0319 19:36:14.456745 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:36:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:36:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:36:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:36:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:36:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:36:20.651887 543705 disk_info.go:125] begin check local disk info of client
I0319 19:36:20.654331 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:36:20.654337 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bea80 0xc0004beac0]
E0319 19:36:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:23.409797 543705 memory.go:184] no items to output this cycle
I0319 19:36:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 19:36:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:33.409775 543705 memory.go:184] no items to output this cycle
I0319 19:36:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 19:36:37.833736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:36:37.833743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:36:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:43.410821 543705 memory.go:191] Add success.
I0319 19:36:43.409808 543705 cpu.go:282] Add success.
I0319 19:36:43.420538 543705 net.go:648] Add success.
I0319 19:36:43.423581 543705 net.go:770] primary dev: ETH0
I0319 19:36:43.423597 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:36:43.423610 543705 net.go:698] Add success.
I0319 19:36:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:36:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:36:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:36:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:36:53.409789 543705 memory.go:184] no items to output this cycle
I0319 19:36:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:37:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:03.409781 543705 memory.go:184] no items to output this cycle
I0319 19:37:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 19:37:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:13.409831 543705 memory.go:191] Add success.
I0319 19:37:13.409835 543705 cpu.go:282] Add success.
W0319 19:37:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:37:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:37:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:37:13.420190 543705 net.go:648] Add success.
I0319 19:37:13.422929 543705 net.go:770] primary dev: ETH0
I0319 19:37:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:37:13.422954 543705 net.go:698] Add success.
I0319 19:37:13.453493 543705 event_worker.go:152] Polling the log file for events...
W0319 19:37:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:37:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 19:37:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:37:14.456953 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:37:14.456963 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:37:14.456969 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:37:14.457012 543705 disk_worker.go:494] system disk:vda1
I0319 19:37:14.457041 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:37:15.456808 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:37:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:37:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:37:16.457983 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:37:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:37:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:37:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:37:20.654419 543705 disk_info.go:125] begin check local disk info of client
I0319 19:37:20.656866 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:37:20.656874 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003718c0 0xc000371900]
E0319 19:37:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:23.409760 543705 memory.go:184] no items to output this cycle
I0319 19:37:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 19:37:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:33.409808 543705 memory.go:184] no items to output this cycle
I0319 19:37:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:37:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:43.409796 543705 memory.go:191] Add success.
I0319 19:37:43.409795 543705 cpu.go:282] Add success.
I0319 19:37:43.419996 543705 net.go:648] Add success.
I0319 19:37:43.422660 543705 net.go:770] primary dev: ETH0
I0319 19:37:43.422676 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:37:43.422689 543705 net.go:698] Add success.
I0319 19:37:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:37:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:37:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:37:53.410267 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:37:53.410282 543705 memory.go:184] no items to output this cycle
I0319 19:37:53.410287 543705 cpu.go:275] no items to output this cycle
E0319 19:38:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:03.409802 543705 memory.go:184] no items to output this cycle
I0319 19:38:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 19:38:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:13.409778 543705 memory.go:191] Add success.
W0319 19:38:13.409981 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:38:13.409996 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:38:13.409996 543705 cpu.go:282] Add success.
I0319 19:38:13.409999 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:38:13.419716 543705 net.go:648] Add success.
I0319 19:38:13.422471 543705 net.go:770] primary dev: ETH0
I0319 19:38:13.422485 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:38:13.422499 543705 net.go:698] Add success.
I0319 19:38:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:38:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:38:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 19:38:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:38:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 19:38:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:38:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:38:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:38:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:38:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:38:16.472433 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:38:20.657675 543705 disk_info.go:125] begin check local disk info of client
I0319 19:38:20.660205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:38:20.660212 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bf000 0xc0002bf040]
E0319 19:38:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:23.409792 543705 memory.go:184] no items to output this cycle
I0319 19:38:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:38:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:33.409773 543705 memory.go:184] no items to output this cycle
I0319 19:38:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:38:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:43.409791 543705 memory.go:191] Add success.
I0319 19:38:43.409791 543705 cpu.go:282] Add success.
I0319 19:38:43.419953 543705 net.go:648] Add success.
I0319 19:38:43.422793 543705 net.go:770] primary dev: ETH0
I0319 19:38:43.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:38:43.422824 543705 net.go:698] Add success.
I0319 19:38:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:38:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:38:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:38:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:38:53.409809 543705 memory.go:184] no items to output this cycle
I0319 19:38:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:39:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:03.409773 543705 memory.go:184] no items to output this cycle
I0319 19:39:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 19:39:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:13.409924 543705 memory.go:191] Add success.
I0319 19:39:13.409946 543705 cpu.go:282] Add success.
W0319 19:39:13.409960 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:39:13.409974 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:39:13.409979 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:39:13.419709 543705 net.go:648] Add success.
I0319 19:39:13.422428 543705 net.go:770] primary dev: ETH0
I0319 19:39:13.422441 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:39:13.422452 543705 net.go:698] Add success.
I0319 19:39:13.601412 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"acfec661-488a-442c-b4d6-885c70cb769d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:39:13.601453 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:39:14.453971 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:39:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:39:14.455279 543705 disk_worker.go:708] disk space is not compliant
W0319 19:39:14.455282 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:39:14.456847 543705 disk_worker.go:494] system disk:vda1
I0319 19:39:14.456876 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:39:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:39:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:39:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:39:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:39:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:39:20.660313 543705 disk_info.go:125] begin check local disk info of client
I0319 19:39:20.662796 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:39:20.662803 543705 disk_info.go:196] parse disk info done, disk is : [0xc00055ca40 0xc00055ca80]
E0319 19:39:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:23.409795 543705 memory.go:184] no items to output this cycle
I0319 19:39:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:39:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:33.409782 543705 memory.go:184] no items to output this cycle
I0319 19:39:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 19:39:37.835868 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:39:37.835875 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:39:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:43.410703 543705 memory.go:191] Add success.
I0319 19:39:43.409791 543705 cpu.go:282] Add success.
I0319 19:39:43.420421 543705 net.go:648] Add success.
I0319 19:39:43.423236 543705 net.go:770] primary dev: ETH0
I0319 19:39:43.423251 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:39:43.423268 543705 net.go:698] Add success.
I0319 19:39:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:39:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:39:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:39:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:39:53.409796 543705 memory.go:184] no items to output this cycle
I0319 19:39:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 19:40:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:03.409782 543705 memory.go:184] no items to output this cycle
I0319 19:40:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:40:13.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:13.409892 543705 memory.go:191] Add success.
W0319 19:40:13.409929 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:40:13.409999 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:40:13.410010 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:40:13.410221 543705 cpu.go:282] Add success.
I0319 19:40:13.419705 543705 net.go:648] Add success.
I0319 19:40:13.422408 543705 net.go:770] primary dev: ETH0
I0319 19:40:13.422423 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:40:13.422437 543705 net.go:698] Add success.
I0319 19:40:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:40:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:40:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0319 19:40:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:40:14.456483 543705 disk_worker.go:494] system disk:vda1
I0319 19:40:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:40:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:40:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:40:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:40:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:40:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:40:20.662886 543705 disk_info.go:125] begin check local disk info of client
I0319 19:40:20.665388 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:40:20.665394 543705 disk_info.go:196] parse disk info done, disk is : [0xc000578380 0xc0005783c0]
E0319 19:40:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:23.409774 543705 memory.go:184] no items to output this cycle
I0319 19:40:23.409831 543705 cpu.go:275] no items to output this cycle
E0319 19:40:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:33.409813 543705 memory.go:184] no items to output this cycle
I0319 19:40:33.409826 543705 cpu.go:275] no items to output this cycle
E0319 19:40:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:43.409784 543705 memory.go:191] Add success.
I0319 19:40:43.409809 543705 cpu.go:282] Add success.
I0319 19:40:43.419942 543705 net.go:648] Add success.
I0319 19:40:43.422983 543705 net.go:770] primary dev: ETH0
I0319 19:40:43.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:40:43.423009 543705 net.go:698] Add success.
I0319 19:40:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:40:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:40:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:40:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:40:53.409785 543705 memory.go:184] no items to output this cycle
I0319 19:40:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 19:41:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:03.409789 543705 memory.go:184] no items to output this cycle
I0319 19:41:03.409790 543705 cpu.go:275] no items to output this cycle
W0319 19:41:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:41:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:41:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 19:41:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:13.409829 543705 memory.go:191] Add success.
I0319 19:41:13.409837 543705 cpu.go:282] Add success.
I0319 19:41:13.420238 543705 net.go:648] Add success.
I0319 19:41:13.422971 543705 net.go:770] primary dev: ETH0
I0319 19:41:13.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:41:13.423000 543705 net.go:698] Add success.
I0319 19:41:14.454921 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:41:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:41:14.455104 543705 disk_worker.go:708] disk space is not compliant
W0319 19:41:14.455106 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:41:14.456414 543705 disk_worker.go:494] system disk:vda1
I0319 19:41:14.456475 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:41:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:41:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:41:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:41:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:41:20.665684 543705 disk_info.go:125] begin check local disk info of client
I0319 19:41:20.668264 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:41:20.668270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002aea00 0xc0002aea40]
E0319 19:41:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:23.409799 543705 memory.go:184] no items to output this cycle
I0319 19:41:23.409812 543705 cpu.go:275] no items to output this cycle
E0319 19:41:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:33.409783 543705 memory.go:184] no items to output this cycle
I0319 19:41:33.409786 543705 cpu.go:275] no items to output this cycle
E0319 19:41:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:43.409778 543705 memory.go:191] Add success.
I0319 19:41:43.409801 543705 cpu.go:282] Add success.
I0319 19:41:43.419883 543705 net.go:648] Add success.
I0319 19:41:43.422630 543705 net.go:770] primary dev: ETH0
I0319 19:41:43.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:41:43.422656 543705 net.go:698] Add success.
I0319 19:41:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:41:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:41:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:41:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:41:53.409776 543705 memory.go:184] no items to output this cycle
I0319 19:41:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 19:42:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:03.409779 543705 memory.go:184] no items to output this cycle
I0319 19:42:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:42:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:13.409819 543705 memory.go:191] Add success.
I0319 19:42:13.409825 543705 cpu.go:282] Add success.
W0319 19:42:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:42:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:42:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:42:13.420171 543705 net.go:648] Add success.
I0319 19:42:13.423013 543705 net.go:770] primary dev: ETH0
I0319 19:42:13.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:42:13.423038 543705 net.go:698] Add success.
I0319 19:42:13.463656 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bf3cb589-20cc-48e3-9d0c-5bb6402d4faf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:42:13.463689 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 19:42:14.455085 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:42:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0319 19:42:14.455148 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:42:14.457016 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:42:14.457023 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:42:14.457027 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:42:14.457043 543705 disk_worker.go:494] system disk:vda1
I0319 19:42:14.457094 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:42:15.456811 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:42:15.456820 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:42:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:42:16.457995 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:42:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:42:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:42:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:42:20.668980 543705 disk_info.go:125] begin check local disk info of client
I0319 19:42:20.671419 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:42:20.671426 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037ec00 0xc00037ec40]
E0319 19:42:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:23.409772 543705 memory.go:184] no items to output this cycle
I0319 19:42:23.409775 543705 cpu.go:275] no items to output this cycle
E0319 19:42:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:33.409809 543705 memory.go:184] no items to output this cycle
I0319 19:42:33.409824 543705 cpu.go:275] no items to output this cycle
I0319 19:42:37.837734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:42:37.837740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:42:43.410231 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:43.411167 543705 memory.go:191] Add success.
I0319 19:42:43.410287 543705 cpu.go:282] Add success.
I0319 19:42:43.419956 543705 net.go:648] Add success.
I0319 19:42:43.422513 543705 net.go:770] primary dev: ETH0
I0319 19:42:43.422528 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:42:43.422542 543705 net.go:698] Add success.
I0319 19:42:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:42:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:42:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:42:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:42:53.409808 543705 memory.go:184] no items to output this cycle
I0319 19:42:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:43:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:03.409774 543705 memory.go:184] no items to output this cycle
I0319 19:43:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:43:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:13.409817 543705 memory.go:191] Add success.
I0319 19:43:13.409826 543705 cpu.go:282] Add success.
W0319 19:43:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:43:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:43:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:43:13.420277 543705 net.go:648] Add success.
I0319 19:43:13.423203 543705 net.go:770] primary dev: ETH0
I0319 19:43:13.423217 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:43:13.423229 543705 net.go:698] Add success.
I0319 19:43:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:43:14.455085 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:43:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0319 19:43:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:43:14.456457 543705 disk_worker.go:494] system disk:vda1
I0319 19:43:14.456500 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:43:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:43:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:43:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:43:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:43:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:43:20.672001 543705 disk_info.go:125] begin check local disk info of client
I0319 19:43:20.674505 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:43:20.674512 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cb280 0xc0004cb2c0]
E0319 19:43:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:23.409776 543705 memory.go:184] no items to output this cycle
I0319 19:43:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 19:43:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:33.409788 543705 memory.go:184] no items to output this cycle
I0319 19:43:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:43:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:43.409822 543705 memory.go:191] Add success.
I0319 19:43:43.409833 543705 cpu.go:282] Add success.
I0319 19:43:43.419915 543705 net.go:648] Add success.
I0319 19:43:43.422702 543705 net.go:770] primary dev: ETH0
I0319 19:43:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:43:43.422730 543705 net.go:698] Add success.
I0319 19:43:46.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:43:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:43:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:43:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:43:53.409779 543705 memory.go:184] no items to output this cycle
I0319 19:43:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:44:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:03.409816 543705 memory.go:184] no items to output this cycle
I0319 19:44:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 19:44:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:13.409797 543705 memory.go:191] Add success.
W0319 19:44:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 19:44:13.409824 543705 cpu.go:282] Add success.
W0319 19:44:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:44:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:44:13.420370 543705 net.go:648] Add success.
I0319 19:44:13.423610 543705 net.go:770] primary dev: ETH0
I0319 19:44:13.423623 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:44:13.423634 543705 net.go:698] Add success.
I0319 19:44:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:44:14.455351 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:44:14.455445 543705 disk_worker.go:708] disk space is not compliant
W0319 19:44:14.455449 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:44:14.457057 543705 disk_worker.go:494] system disk:vda1
I0319 19:44:14.457085 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:44:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:44:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:44:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:44:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:44:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:44:20.675012 543705 disk_info.go:125] begin check local disk info of client
I0319 19:44:20.677413 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:44:20.677419 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bd3c0 0xc0004bd400]
E0319 19:44:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:23.409805 543705 memory.go:184] no items to output this cycle
I0319 19:44:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 19:44:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:33.409795 543705 memory.go:184] no items to output this cycle
I0319 19:44:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 19:44:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:43.409805 543705 memory.go:191] Add success.
I0319 19:44:43.409808 543705 cpu.go:282] Add success.
I0319 19:44:43.419872 543705 net.go:648] Add success.
I0319 19:44:43.422709 543705 net.go:770] primary dev: ETH0
I0319 19:44:43.422722 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:44:43.422734 543705 net.go:698] Add success.
I0319 19:44:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:44:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:44:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:44:53.409789 543705 memory.go:184] no items to output this cycle
I0319 19:44:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 19:45:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:03.409817 543705 memory.go:184] no items to output this cycle
I0319 19:45:03.409830 543705 cpu.go:275] no items to output this cycle
E0319 19:45:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:13.409800 543705 memory.go:191] Add success.
I0319 19:45:13.409801 543705 cpu.go:282] Add success.
W0319 19:45:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:45:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:45:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:45:13.420117 543705 net.go:648] Add success.
I0319 19:45:13.423141 543705 net.go:770] primary dev: ETH0
I0319 19:45:13.423154 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:45:13.423166 543705 net.go:698] Add success.
I0319 19:45:13.469024 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"00c13ee2-69e2-4120-b620-f0ccf0b84c96","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:45:13.469056 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:45:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:45:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:45:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 19:45:14.455272 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:45:14.456889 543705 disk_worker.go:494] system disk:vda1
I0319 19:45:14.456929 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:45:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:45:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:45:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:45:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:45:16.472428 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:45:20.677673 543705 disk_info.go:125] begin check local disk info of client
I0319 19:45:20.680165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:45:20.680171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ae300 0xc0002ae340]
E0319 19:45:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:23.409761 543705 memory.go:184] no items to output this cycle
I0319 19:45:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:45:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:33.409808 543705 memory.go:184] no items to output this cycle
I0319 19:45:33.409838 543705 cpu.go:275] no items to output this cycle
I0319 19:45:37.837883 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:45:37.837890 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:45:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:43.410660 543705 memory.go:191] Add success.
I0319 19:45:43.409802 543705 cpu.go:282] Add success.
I0319 19:45:43.420381 543705 net.go:648] Add success.
I0319 19:45:43.423128 543705 net.go:770] primary dev: ETH0
I0319 19:45:43.423141 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:45:43.423154 543705 net.go:698] Add success.
I0319 19:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:45:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:45:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:45:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:45:53.409769 543705 memory.go:184] no items to output this cycle
I0319 19:45:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 19:46:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:03.409782 543705 memory.go:184] no items to output this cycle
I0319 19:46:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 19:46:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:13.409793 543705 memory.go:191] Add success.
I0319 19:46:13.409796 543705 cpu.go:282] Add success.
W0319 19:46:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:46:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:46:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:46:13.420295 543705 net.go:648] Add success.
I0319 19:46:13.423367 543705 net.go:770] primary dev: ETH0
I0319 19:46:13.423385 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:46:13.423399 543705 net.go:698] Add success.
I0319 19:46:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:46:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:46:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0319 19:46:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:46:14.457675 543705 disk_worker.go:494] system disk:vda1
I0319 19:46:14.457718 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:46:15.456017 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:46:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:46:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:46:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:46:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:46:20.681036 543705 disk_info.go:125] begin check local disk info of client
I0319 19:46:20.683479 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:46:20.683484 543705 disk_info.go:196] parse disk info done, disk is : [0xc000482080 0xc0004820c0]
E0319 19:46:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:23.409759 543705 memory.go:184] no items to output this cycle
I0319 19:46:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:46:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:33.409804 543705 memory.go:184] no items to output this cycle
I0319 19:46:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 19:46:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:43.409783 543705 memory.go:191] Add success.
I0319 19:46:43.409806 543705 cpu.go:282] Add success.
I0319 19:46:43.419902 543705 net.go:648] Add success.
I0319 19:46:43.422681 543705 net.go:770] primary dev: ETH0
I0319 19:46:43.422695 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:46:43.422711 543705 net.go:698] Add success.
I0319 19:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:46:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:46:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:46:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:46:53.409790 543705 memory.go:184] no items to output this cycle
I0319 19:46:53.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:47:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:03.409773 543705 memory.go:184] no items to output this cycle
I0319 19:47:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 19:47:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:13.409811 543705 memory.go:191] Add success.
I0319 19:47:13.409819 543705 cpu.go:282] Add success.
W0319 19:47:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:47:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:47:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:47:13.420280 543705 net.go:648] Add success.
I0319 19:47:13.423254 543705 net.go:770] primary dev: ETH0
I0319 19:47:13.423282 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:47:13.423294 543705 net.go:698] Add success.
I0319 19:47:13.452935 543705 event_worker.go:152] Polling the log file for events...
W0319 19:47:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:47:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 19:47:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:47:14.455892 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:47:14.455901 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:47:14.455907 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:47:14.456540 543705 disk_worker.go:494] system disk:vda1
I0319 19:47:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:47:15.456861 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:47:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:47:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:47:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:47:16.457976 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:47:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:47:16.472322 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:47:20.684060 543705 disk_info.go:125] begin check local disk info of client
I0319 19:47:20.686454 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:47:20.686459 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028c500 0xc00028c540]
E0319 19:47:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:23.409793 543705 memory.go:184] no items to output this cycle
I0319 19:47:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:47:33.410321 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:33.410341 543705 memory.go:184] no items to output this cycle
I0319 19:47:33.410351 543705 cpu.go:275] no items to output this cycle
E0319 19:47:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:43.409785 543705 memory.go:191] Add success.
I0319 19:47:43.409787 543705 cpu.go:282] Add success.
I0319 19:47:43.419881 543705 net.go:648] Add success.
I0319 19:47:43.422590 543705 net.go:770] primary dev: ETH0
I0319 19:47:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:47:43.422619 543705 net.go:698] Add success.
I0319 19:47:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:47:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:47:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:47:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:47:53.409786 543705 memory.go:184] no items to output this cycle
I0319 19:47:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 19:48:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:03.409783 543705 memory.go:184] no items to output this cycle
I0319 19:48:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 19:48:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:13.409796 543705 memory.go:191] Add success.
I0319 19:48:13.409797 543705 cpu.go:282] Add success.
W0319 19:48:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:48:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:48:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:48:13.420613 543705 net.go:648] Add success.
I0319 19:48:13.423223 543705 net.go:770] primary dev: ETH0
I0319 19:48:13.423236 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:48:13.423250 543705 net.go:698] Add success.
I0319 19:48:13.468342 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c11e6322-5b1c-48c3-8ba2-d773ed0ea310","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:48:13.468375 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:48:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:48:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:48:14.455234 543705 disk_worker.go:708] disk space is not compliant
W0319 19:48:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:48:14.456872 543705 disk_worker.go:494] system disk:vda1
I0319 19:48:14.456907 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:48:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:48:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:48:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:48:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:48:20.687076 543705 disk_info.go:125] begin check local disk info of client
I0319 19:48:20.689562 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:48:20.689569 543705 disk_info.go:196] parse disk info done, disk is : [0xc000494080 0xc0004940c0]
E0319 19:48:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:23.409765 543705 memory.go:184] no items to output this cycle
I0319 19:48:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 19:48:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:33.409813 543705 memory.go:184] no items to output this cycle
I0319 19:48:33.409825 543705 cpu.go:275] no items to output this cycle
I0319 19:48:37.839900 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:48:37.839907 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:48:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:43.410628 543705 memory.go:191] Add success.
I0319 19:48:43.409827 543705 cpu.go:282] Add success.
I0319 19:48:43.420406 543705 net.go:648] Add success.
I0319 19:48:43.423189 543705 net.go:770] primary dev: ETH0
I0319 19:48:43.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:48:43.423216 543705 net.go:698] Add success.
I0319 19:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:48:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:48:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:48:53.409806 543705 memory.go:184] no items to output this cycle
I0319 19:48:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 19:49:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:03.409789 543705 memory.go:184] no items to output this cycle
I0319 19:49:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:49:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:13.409804 543705 memory.go:191] Add success.
I0319 19:49:13.409804 543705 cpu.go:282] Add success.
W0319 19:49:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:49:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:49:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:49:13.420258 543705 net.go:648] Add success.
I0319 19:49:13.422830 543705 net.go:770] primary dev: ETH0
I0319 19:49:13.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:49:13.422855 543705 net.go:698] Add success.
I0319 19:49:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:49:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:49:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 19:49:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:49:14.456596 543705 disk_worker.go:494] system disk:vda1
I0319 19:49:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:49:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:49:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:49:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:49:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:49:20.689674 543705 disk_info.go:125] begin check local disk info of client
I0319 19:49:20.692158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:49:20.692164 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396080 0xc0003960c0]
E0319 19:49:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:23.409782 543705 memory.go:184] no items to output this cycle
I0319 19:49:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 19:49:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:33.409783 543705 memory.go:184] no items to output this cycle
I0319 19:49:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 19:49:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:43.409795 543705 memory.go:191] Add success.
I0319 19:49:43.409794 543705 cpu.go:282] Add success.
I0319 19:49:43.419906 543705 net.go:648] Add success.
I0319 19:49:43.423010 543705 net.go:770] primary dev: ETH0
I0319 19:49:43.423026 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:49:43.423043 543705 net.go:698] Add success.
I0319 19:49:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:49:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:49:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:49:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:49:53.409762 543705 memory.go:184] no items to output this cycle
I0319 19:49:53.409835 543705 cpu.go:275] no items to output this cycle
E0319 19:50:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:03.409785 543705 memory.go:184] no items to output this cycle
I0319 19:50:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 19:50:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:13.409785 543705 memory.go:191] Add success.
I0319 19:50:13.409808 543705 cpu.go:282] Add success.
W0319 19:50:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:50:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:50:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:50:13.420178 543705 net.go:648] Add success.
I0319 19:50:13.423234 543705 net.go:770] primary dev: ETH0
I0319 19:50:13.423248 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:50:13.423260 543705 net.go:698] Add success.
I0319 19:50:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:50:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:50:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 19:50:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:50:14.456572 543705 disk_worker.go:494] system disk:vda1
I0319 19:50:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:50:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:50:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:50:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:50:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:50:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:50:20.693100 543705 disk_info.go:125] begin check local disk info of client
I0319 19:50:20.695527 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:50:20.695533 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047e380 0xc00047e3c0]
E0319 19:50:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:23.409758 543705 memory.go:184] no items to output this cycle
I0319 19:50:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 19:50:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:33.409785 543705 memory.go:184] no items to output this cycle
I0319 19:50:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:50:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:43.409797 543705 memory.go:191] Add success.
I0319 19:50:43.409799 543705 cpu.go:282] Add success.
I0319 19:50:43.419885 543705 net.go:648] Add success.
I0319 19:50:43.422881 543705 net.go:770] primary dev: ETH0
I0319 19:50:43.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:50:43.422910 543705 net.go:698] Add success.
I0319 19:50:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:50:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:50:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:50:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:50:53.409814 543705 memory.go:184] no items to output this cycle
I0319 19:50:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:51:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:03.409773 543705 memory.go:184] no items to output this cycle
I0319 19:51:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:51:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:13.409788 543705 memory.go:191] Add success.
I0319 19:51:13.409788 543705 cpu.go:282] Add success.
W0319 19:51:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:51:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:51:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:51:13.420130 543705 net.go:648] Add success.
I0319 19:51:13.422806 543705 net.go:770] primary dev: ETH0
I0319 19:51:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:51:13.422830 543705 net.go:698] Add success.
I0319 19:51:13.473776 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a4239847-79ab-46ea-bc3a-2f4d037c8bf4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:51:13.473811 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:51:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:51:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:51:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 19:51:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:51:14.457181 543705 disk_worker.go:494] system disk:vda1
I0319 19:51:14.457210 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:51:15.455603 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:51:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:51:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:51:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:51:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:51:20.695614 543705 disk_info.go:125] begin check local disk info of client
I0319 19:51:20.698101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:51:20.698107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fe300 0xc0003fe340]
E0319 19:51:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:23.409790 543705 memory.go:184] no items to output this cycle
I0319 19:51:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:51:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:33.409784 543705 memory.go:184] no items to output this cycle
I0319 19:51:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 19:51:37.841734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:51:37.841741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:51:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:43.410892 543705 memory.go:191] Add success.
I0319 19:51:43.409806 543705 cpu.go:282] Add success.
I0319 19:51:43.420655 543705 net.go:648] Add success.
I0319 19:51:43.423889 543705 net.go:770] primary dev: ETH0
I0319 19:51:43.423902 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:51:43.423915 543705 net.go:698] Add success.
I0319 19:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:51:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:51:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:51:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:51:53.409770 543705 memory.go:184] no items to output this cycle
I0319 19:51:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:52:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:03.409783 543705 memory.go:184] no items to output this cycle
I0319 19:52:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 19:52:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:13.409811 543705 memory.go:191] Add success.
I0319 19:52:13.409816 543705 cpu.go:282] Add success.
W0319 19:52:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:52:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:52:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:52:13.420137 543705 net.go:648] Add success.
I0319 19:52:13.423177 543705 net.go:770] primary dev: ETH0
I0319 19:52:13.423192 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:52:13.423207 543705 net.go:698] Add success.
W0319 19:52:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:52:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 19:52:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:52:14.456128 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:52:14.456137 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:52:14.456144 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:52:14.456422 543705 disk_worker.go:494] system disk:vda1
I0319 19:52:14.456454 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:52:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:52:15.456838 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:52:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:52:16.457976 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:52:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:52:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:52:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:52:20.699135 543705 disk_info.go:125] begin check local disk info of client
I0319 19:52:20.701617 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:52:20.701624 543705 disk_info.go:196] parse disk info done, disk is : [0xc000482140 0xc000482180]
E0319 19:52:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:23.409763 543705 memory.go:184] no items to output this cycle
I0319 19:52:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:52:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:33.409773 543705 memory.go:184] no items to output this cycle
I0319 19:52:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 19:52:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:43.409816 543705 memory.go:191] Add success.
I0319 19:52:43.409824 543705 cpu.go:282] Add success.
I0319 19:52:43.419867 543705 net.go:648] Add success.
I0319 19:52:43.422559 543705 net.go:770] primary dev: ETH0
I0319 19:52:43.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:52:43.422585 543705 net.go:698] Add success.
I0319 19:52:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:52:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:52:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:52:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:52:53.409794 543705 memory.go:184] no items to output this cycle
I0319 19:52:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 19:53:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:03.409775 543705 memory.go:184] no items to output this cycle
I0319 19:53:03.409779 543705 cpu.go:275] no items to output this cycle
E0319 19:53:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:13.409786 543705 memory.go:191] Add success.
I0319 19:53:13.409811 543705 cpu.go:282] Add success.
W0319 19:53:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:53:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:53:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:53:13.420190 543705 net.go:648] Add success.
I0319 19:53:13.423057 543705 net.go:770] primary dev: ETH0
I0319 19:53:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:53:13.423085 543705 net.go:698] Add success.
I0319 19:53:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:53:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:53:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 19:53:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:53:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 19:53:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:53:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:53:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:53:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:53:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:53:16.472463 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:53:20.701682 543705 disk_info.go:125] begin check local disk info of client
I0319 19:53:20.704159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:53:20.704165 543705 disk_info.go:196] parse disk info done, disk is : [0xc000382100 0xc000382140]
E0319 19:53:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:23.409795 543705 memory.go:184] no items to output this cycle
I0319 19:53:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 19:53:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:33.409768 543705 memory.go:184] no items to output this cycle
I0319 19:53:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:53:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:43.409823 543705 memory.go:191] Add success.
I0319 19:53:43.409826 543705 cpu.go:282] Add success.
I0319 19:53:43.419905 543705 net.go:648] Add success.
I0319 19:53:43.422982 543705 net.go:770] primary dev: ETH0
I0319 19:53:43.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:53:43.423008 543705 net.go:698] Add success.
I0319 19:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:53:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:53:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:53:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:53:53.409794 543705 memory.go:184] no items to output this cycle
I0319 19:53:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:54:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:03.409789 543705 memory.go:184] no items to output this cycle
I0319 19:54:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 19:54:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:13.409834 543705 memory.go:191] Add success.
I0319 19:54:13.409834 543705 cpu.go:282] Add success.
W0319 19:54:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:54:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:54:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:54:13.420172 543705 net.go:648] Add success.
I0319 19:54:13.422759 543705 net.go:770] primary dev: ETH0
I0319 19:54:13.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:54:13.422785 543705 net.go:698] Add success.
I0319 19:54:13.468856 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d01bd806-9dd3-4889-8637-2303f66e46c6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:54:13.468889 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 19:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:54:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:54:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 19:54:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:54:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 19:54:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:54:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:54:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:54:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:54:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:54:20.705200 543705 disk_info.go:125] begin check local disk info of client
I0319 19:54:20.707704 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:54:20.707712 543705 disk_info.go:196] parse disk info done, disk is : [0xc000342200 0xc000342240]
E0319 19:54:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:23.409800 543705 memory.go:184] no items to output this cycle
I0319 19:54:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 19:54:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:33.409812 543705 memory.go:184] no items to output this cycle
I0319 19:54:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 19:54:37.843907 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:54:37.843913 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:54:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:43.410637 543705 memory.go:191] Add success.
I0319 19:54:43.409804 543705 cpu.go:282] Add success.
I0319 19:54:43.420363 543705 net.go:648] Add success.
I0319 19:54:43.423208 543705 net.go:770] primary dev: ETH0
I0319 19:54:43.423222 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:54:43.423234 543705 net.go:698] Add success.
I0319 19:54:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:54:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:54:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:54:53.410264 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:54:53.410282 543705 memory.go:184] no items to output this cycle
I0319 19:54:53.410288 543705 cpu.go:275] no items to output this cycle
E0319 19:55:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:03.409785 543705 memory.go:184] no items to output this cycle
I0319 19:55:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 19:55:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:13.409805 543705 memory.go:191] Add success.
I0319 19:55:13.409817 543705 cpu.go:282] Add success.
W0319 19:55:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:55:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:55:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:55:13.420068 543705 net.go:648] Add success.
I0319 19:55:13.422682 543705 net.go:770] primary dev: ETH0
I0319 19:55:13.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:55:13.422706 543705 net.go:698] Add success.
I0319 19:55:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:55:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:55:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 19:55:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:55:14.456493 543705 disk_worker.go:494] system disk:vda1
I0319 19:55:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:55:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:55:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:55:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:55:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:55:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:55:20.709174 543705 disk_info.go:125] begin check local disk info of client
I0319 19:55:20.711603 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:55:20.711609 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f7c80 0xc0001f7cc0]
E0319 19:55:23.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:23.409878 543705 memory.go:184] no items to output this cycle
I0319 19:55:23.409964 543705 cpu.go:275] no items to output this cycle
E0319 19:55:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:33.409783 543705 memory.go:184] no items to output this cycle
I0319 19:55:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 19:55:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:43.409782 543705 memory.go:191] Add success.
I0319 19:55:43.409802 543705 cpu.go:282] Add success.
I0319 19:55:43.419962 543705 net.go:648] Add success.
I0319 19:55:43.422499 543705 net.go:770] primary dev: ETH0
I0319 19:55:43.422512 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:55:43.422524 543705 net.go:698] Add success.
I0319 19:55:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:55:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:55:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:55:53.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:55:53.409820 543705 cpu.go:275] no items to output this cycle
I0319 19:55:53.409822 543705 memory.go:184] no items to output this cycle
E0319 19:56:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:03.409806 543705 memory.go:184] no items to output this cycle
I0319 19:56:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 19:56:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:13.409776 543705 memory.go:191] Add success.
W0319 19:56:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:56:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:56:13.409812 543705 cpu.go:282] Add success.
I0319 19:56:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:56:13.420321 543705 net.go:648] Add success.
I0319 19:56:13.423132 543705 net.go:770] primary dev: ETH0
I0319 19:56:13.423146 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:56:13.423160 543705 net.go:698] Add success.
I0319 19:56:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:56:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:56:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 19:56:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:56:14.456488 543705 disk_worker.go:494] system disk:vda1
I0319 19:56:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:56:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:56:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:56:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:56:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:56:20.712196 543705 disk_info.go:125] begin check local disk info of client
I0319 19:56:20.714650 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:56:20.714656 543705 disk_info.go:196] parse disk info done, disk is : [0xc000234580 0xc0002345c0]
E0319 19:56:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:23.409792 543705 memory.go:184] no items to output this cycle
I0319 19:56:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 19:56:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:33.409787 543705 memory.go:184] no items to output this cycle
I0319 19:56:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 19:56:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:43.409782 543705 memory.go:191] Add success.
I0319 19:56:43.409794 543705 cpu.go:282] Add success.
I0319 19:56:43.419868 543705 net.go:648] Add success.
I0319 19:56:43.422852 543705 net.go:770] primary dev: ETH0
I0319 19:56:43.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:56:43.422884 543705 net.go:698] Add success.
I0319 19:56:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:56:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:56:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:56:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:56:53.409793 543705 cpu.go:275] no items to output this cycle
I0319 19:56:53.409799 543705 memory.go:184] no items to output this cycle
E0319 19:57:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:03.409796 543705 memory.go:184] no items to output this cycle
I0319 19:57:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 19:57:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:13.409789 543705 memory.go:191] Add success.
I0319 19:57:13.409788 543705 cpu.go:282] Add success.
W0319 19:57:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:57:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:57:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:57:13.420058 543705 net.go:648] Add success.
I0319 19:57:13.422672 543705 net.go:770] primary dev: ETH0
I0319 19:57:13.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:57:13.422700 543705 net.go:698] Add success.
I0319 19:57:13.428573 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 19:57:13.452842 543705 event_worker.go:152] Polling the log file for events...
I0319 19:57:13.467995 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9a9f30af-c0d5-47a4-8deb-4d7d8b16cdfd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 19:57:13.468026 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 19:57:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:57:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 19:57:14.455198 543705 disk_worker.go:728] disk inode is not compliant
E0319 19:57:14.455923 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 19:57:14.455932 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 19:57:14.455938 543705 custom_config.go:64] query custom config with name: gpu
I0319 19:57:14.456694 543705 disk_worker.go:494] system disk:vda1
I0319 19:57:14.456727 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 19:57:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 19:57:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:57:16.457910 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 19:57:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 19:57:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:57:16.457992 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:57:16.472308 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:57:20.715214 543705 disk_info.go:125] begin check local disk info of client
I0319 19:57:20.717686 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:57:20.717692 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004645c0 0xc000464600]
E0319 19:57:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:23.409761 543705 memory.go:184] no items to output this cycle
I0319 19:57:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:57:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 19:57:33.409797 543705 memory.go:184] no items to output this cycle
I0319 19:57:37.845739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 19:57:37.845746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 19:57:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:43.410725 543705 memory.go:191] Add success.
I0319 19:57:43.409825 543705 cpu.go:282] Add success.
I0319 19:57:43.420637 543705 net.go:648] Add success.
I0319 19:57:43.423851 543705 net.go:770] primary dev: ETH0
I0319 19:57:43.423864 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:57:43.423876 543705 net.go:698] Add success.
I0319 19:57:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:57:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:57:46.458098 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:57:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:57:53.409784 543705 memory.go:184] no items to output this cycle
I0319 19:57:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 19:58:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:03.409795 543705 cpu.go:275] no items to output this cycle
I0319 19:58:03.409797 543705 memory.go:184] no items to output this cycle
E0319 19:58:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:13.409781 543705 memory.go:191] Add success.
W0319 19:58:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 19:58:13.409813 543705 cpu.go:282] Add success.
W0319 19:58:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:58:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:58:13.420132 543705 net.go:648] Add success.
I0319 19:58:13.422953 543705 net.go:770] primary dev: ETH0
I0319 19:58:13.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:58:13.422978 543705 net.go:698] Add success.
I0319 19:58:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:58:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:58:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 19:58:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:58:14.456569 543705 disk_worker.go:494] system disk:vda1
I0319 19:58:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:58:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:58:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:58:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:58:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:58:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:58:20.719232 543705 disk_info.go:125] begin check local disk info of client
I0319 19:58:20.721750 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:58:20.721756 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256400 0xc000256440]
E0319 19:58:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:23.409763 543705 memory.go:184] no items to output this cycle
I0319 19:58:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 19:58:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:33.409778 543705 memory.go:184] no items to output this cycle
I0319 19:58:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 19:58:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:43.409916 543705 memory.go:191] Add success.
I0319 19:58:43.409944 543705 cpu.go:282] Add success.
I0319 19:58:43.419722 543705 net.go:648] Add success.
I0319 19:58:43.422585 543705 net.go:770] primary dev: ETH0
I0319 19:58:43.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:58:43.422609 543705 net.go:698] Add success.
I0319 19:58:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:58:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:58:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:58:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:58:53.409772 543705 memory.go:184] no items to output this cycle
I0319 19:58:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 19:59:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:03.409779 543705 memory.go:184] no items to output this cycle
I0319 19:59:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 19:59:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:13.409794 543705 memory.go:191] Add success.
I0319 19:59:13.409799 543705 cpu.go:282] Add success.
W0319 19:59:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 19:59:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 19:59:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 19:59:13.420116 543705 net.go:648] Add success.
I0319 19:59:13.422778 543705 net.go:770] primary dev: ETH0
I0319 19:59:13.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:59:13.422803 543705 net.go:698] Add success.
I0319 19:59:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 19:59:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 19:59:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 19:59:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 19:59:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 19:59:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 19:59:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 19:59:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:59:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:59:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 19:59:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 19:59:20.723248 543705 disk_info.go:125] begin check local disk info of client
I0319 19:59:20.725715 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 19:59:20.725721 543705 disk_info.go:196] parse disk info done, disk is : [0xc000559900 0xc000559940]
E0319 19:59:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:23.409769 543705 memory.go:184] no items to output this cycle
I0319 19:59:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 19:59:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:33.409785 543705 memory.go:184] no items to output this cycle
I0319 19:59:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 19:59:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:43.409810 543705 memory.go:191] Add success.
I0319 19:59:43.409823 543705 cpu.go:282] Add success.
I0319 19:59:43.420185 543705 net.go:648] Add success.
I0319 19:59:43.423025 543705 net.go:770] primary dev: ETH0
I0319 19:59:43.423038 543705 net.go:802] Send network stats successfully!,count is 6
I0319 19:59:43.423049 543705 net.go:698] Add success.
I0319 19:59:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 19:59:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 19:59:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 19:59:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 19:59:53.409816 543705 memory.go:184] no items to output this cycle
I0319 19:59:53.409860 543705 cpu.go:275] no items to output this cycle
E0319 20:00:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:03.409804 543705 memory.go:184] no items to output this cycle
I0319 20:00:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:00:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:13.409787 543705 memory.go:191] Add success.
I0319 20:00:13.409809 543705 cpu.go:282] Add success.
W0319 20:00:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:00:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:00:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:00:13.420044 543705 net.go:648] Add success.
I0319 20:00:13.422540 543705 net.go:770] primary dev: ETH0
I0319 20:00:13.422553 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:00:13.422565 543705 net.go:698] Add success.
I0319 20:00:13.463720 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1688084f-3c94-41cc-b108-275873f2156f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:00:13.463753 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:00:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:00:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:00:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 20:00:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:00:14.456666 543705 disk_worker.go:494] system disk:vda1
I0319 20:00:14.456700 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:00:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:00:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:00:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:00:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:00:20.725802 543705 disk_info.go:125] begin check local disk info of client
I0319 20:00:20.728214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:00:20.728219 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c82c0 0xc0003c8300]
E0319 20:00:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:23.409771 543705 memory.go:184] no items to output this cycle
I0319 20:00:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:00:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:33.409782 543705 memory.go:184] no items to output this cycle
I0319 20:00:33.409803 543705 cpu.go:275] no items to output this cycle
I0319 20:00:37.847940 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:00:37.847948 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:00:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:43.410642 543705 memory.go:191] Add success.
I0319 20:00:43.409804 543705 cpu.go:282] Add success.
I0319 20:00:43.420561 543705 net.go:648] Add success.
I0319 20:00:43.423179 543705 net.go:770] primary dev: ETH0
I0319 20:00:43.423192 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:00:43.423203 543705 net.go:698] Add success.
I0319 20:00:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:00:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:00:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:00:53.410206 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:00:53.410225 543705 memory.go:184] no items to output this cycle
I0319 20:00:53.410258 543705 cpu.go:275] no items to output this cycle
E0319 20:01:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:03.409799 543705 memory.go:184] no items to output this cycle
I0319 20:01:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:01:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:13.409780 543705 memory.go:191] Add success.
W0319 20:01:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:01:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:01:13.409818 543705 cpu.go:282] Add success.
I0319 20:01:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:01:13.420508 543705 net.go:648] Add success.
I0319 20:01:13.423387 543705 net.go:770] primary dev: ETH0
I0319 20:01:13.423399 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:01:13.423412 543705 net.go:698] Add success.
I0319 20:01:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:01:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:01:14.455140 543705 disk_worker.go:708] disk space is not compliant
W0319 20:01:14.455143 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:01:14.456469 543705 disk_worker.go:494] system disk:vda1
I0319 20:01:14.456516 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:01:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:01:16.458004 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:01:16.458071 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:01:16.458097 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:01:16.472432 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:01:20.729285 543705 disk_info.go:125] begin check local disk info of client
I0319 20:01:20.731767 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:01:20.731773 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8680 0xc0003c86c0]
E0319 20:01:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:23.409796 543705 memory.go:184] no items to output this cycle
I0319 20:01:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:01:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:33.409812 543705 memory.go:184] no items to output this cycle
I0319 20:01:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 20:01:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:43.409799 543705 memory.go:191] Add success.
I0319 20:01:43.409801 543705 cpu.go:282] Add success.
I0319 20:01:43.419986 543705 net.go:648] Add success.
I0319 20:01:43.423047 543705 net.go:770] primary dev: ETH0
I0319 20:01:43.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:01:43.423078 543705 net.go:698] Add success.
I0319 20:01:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:01:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:01:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:01:53.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:01:53.409900 543705 memory.go:184] no items to output this cycle
I0319 20:01:53.410052 543705 cpu.go:275] no items to output this cycle
E0319 20:02:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:03.409784 543705 memory.go:184] no items to output this cycle
I0319 20:02:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 20:02:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:13.409786 543705 memory.go:191] Add success.
W0319 20:02:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:02:13.409813 543705 cpu.go:282] Add success.
W0319 20:02:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:02:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:02:13.420256 543705 net.go:648] Add success.
I0319 20:02:13.423079 543705 net.go:770] primary dev: ETH0
I0319 20:02:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:02:13.423108 543705 net.go:698] Add success.
W0319 20:02:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:02:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 20:02:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:02:14.455908 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:02:14.455916 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:02:14.455923 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:02:14.456542 543705 disk_worker.go:494] system disk:vda1
I0319 20:02:14.456573 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:02:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:02:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:02:16.457906 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:02:16.457906 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:02:16.457960 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:02:16.457980 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:02:16.472294 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:02:20.731856 543705 disk_info.go:125] begin check local disk info of client
I0319 20:02:20.734305 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:02:20.734311 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396540 0xc000396580]
E0319 20:02:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:23.409758 543705 memory.go:184] no items to output this cycle
I0319 20:02:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:02:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:33.409790 543705 memory.go:184] no items to output this cycle
I0319 20:02:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 20:02:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:43.409795 543705 memory.go:191] Add success.
I0319 20:02:43.409804 543705 cpu.go:282] Add success.
I0319 20:02:43.419843 543705 net.go:648] Add success.
I0319 20:02:43.422808 543705 net.go:770] primary dev: ETH0
I0319 20:02:43.422821 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:02:43.422833 543705 net.go:698] Add success.
I0319 20:02:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:02:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:02:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:02:53.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:02:53.409896 543705 cpu.go:275] no items to output this cycle
I0319 20:02:53.409899 543705 memory.go:184] no items to output this cycle
E0319 20:03:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:03.409763 543705 memory.go:184] no items to output this cycle
I0319 20:03:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:03:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:13.409815 543705 memory.go:191] Add success.
I0319 20:03:13.409827 543705 cpu.go:282] Add success.
W0319 20:03:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:03:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:03:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:03:13.420119 543705 net.go:648] Add success.
I0319 20:03:13.422994 543705 net.go:770] primary dev: ETH0
I0319 20:03:13.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:03:13.423019 543705 net.go:698] Add success.
I0319 20:03:13.525713 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6bb4eeb0-9667-45ec-87cc-bc671b0b70fd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:03:13.525745 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:03:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:03:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:03:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 20:03:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:03:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 20:03:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:03:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:03:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:03:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:03:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:03:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:03:20.734399 543705 disk_info.go:125] begin check local disk info of client
I0319 20:03:20.736904 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:03:20.736910 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b6c0 0xc00007b700]
E0319 20:03:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:23.409796 543705 memory.go:184] no items to output this cycle
I0319 20:03:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:03:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:33.409780 543705 memory.go:184] no items to output this cycle
I0319 20:03:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 20:03:37.849735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:03:37.849741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:03:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:43.410781 543705 memory.go:191] Add success.
I0319 20:03:43.409829 543705 cpu.go:282] Add success.
I0319 20:03:43.420503 543705 net.go:648] Add success.
I0319 20:03:43.423210 543705 net.go:770] primary dev: ETH0
I0319 20:03:43.423223 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:03:43.423235 543705 net.go:698] Add success.
I0319 20:03:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:03:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:03:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:03:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:03:53.409774 543705 memory.go:184] no items to output this cycle
I0319 20:03:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 20:04:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:03.409795 543705 memory.go:184] no items to output this cycle
I0319 20:04:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:04:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:13.409791 543705 memory.go:191] Add success.
I0319 20:04:13.409800 543705 cpu.go:282] Add success.
W0319 20:04:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:04:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:04:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:04:13.420207 543705 net.go:648] Add success.
I0319 20:04:13.423076 543705 net.go:770] primary dev: ETH0
I0319 20:04:13.423089 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:04:13.423101 543705 net.go:698] Add success.
I0319 20:04:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:04:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:04:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0319 20:04:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:04:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 20:04:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:04:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:04:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:04:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:04:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:04:20.737685 543705 disk_info.go:125] begin check local disk info of client
I0319 20:04:20.740215 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:04:20.740222 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab180 0xc0001ab1c0]
E0319 20:04:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:23.409775 543705 memory.go:184] no items to output this cycle
I0319 20:04:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:04:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:33.409819 543705 memory.go:184] no items to output this cycle
I0319 20:04:33.409835 543705 cpu.go:275] no items to output this cycle
E0319 20:04:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:43.409831 543705 memory.go:191] Add success.
I0319 20:04:43.409838 543705 cpu.go:282] Add success.
I0319 20:04:43.419707 543705 net.go:770] primary dev: ETH0
I0319 20:04:43.419722 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:04:43.419736 543705 net.go:698] Add success.
I0319 20:04:43.419966 543705 net.go:648] Add success.
I0319 20:04:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:04:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:04:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:04:53.409796 543705 cpu.go:275] no items to output this cycle
I0319 20:04:53.409797 543705 memory.go:184] no items to output this cycle
E0319 20:05:03.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:03.409916 543705 memory.go:184] no items to output this cycle
I0319 20:05:03.409999 543705 cpu.go:275] no items to output this cycle
E0319 20:05:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:13.409801 543705 memory.go:191] Add success.
I0319 20:05:13.409818 543705 cpu.go:282] Add success.
W0319 20:05:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:05:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:05:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:05:13.420105 543705 net.go:648] Add success.
I0319 20:05:13.422610 543705 net.go:770] primary dev: ETH0
I0319 20:05:13.422623 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:05:13.422635 543705 net.go:698] Add success.
I0319 20:05:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:05:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:05:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 20:05:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:05:14.456773 543705 disk_worker.go:494] system disk:vda1
I0319 20:05:14.456803 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:05:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:05:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:05:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:05:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:05:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:05:20.740306 543705 disk_info.go:125] begin check local disk info of client
I0319 20:05:20.742857 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:05:20.742863 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004875c0 0xc000487600]
E0319 20:05:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:23.409769 543705 memory.go:184] no items to output this cycle
I0319 20:05:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 20:05:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:33.409809 543705 memory.go:184] no items to output this cycle
I0319 20:05:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 20:05:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:43.409789 543705 memory.go:191] Add success.
I0319 20:05:43.409816 543705 cpu.go:282] Add success.
I0319 20:05:43.420076 543705 net.go:648] Add success.
I0319 20:05:43.422833 543705 net.go:770] primary dev: ETH0
I0319 20:05:43.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:05:43.422859 543705 net.go:698] Add success.
I0319 20:05:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:05:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:05:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:05:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:05:53.409786 543705 cpu.go:275] no items to output this cycle
I0319 20:05:53.409787 543705 memory.go:184] no items to output this cycle
E0319 20:06:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:03.409812 543705 memory.go:184] no items to output this cycle
I0319 20:06:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 20:06:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:13.409798 543705 memory.go:191] Add success.
I0319 20:06:13.409817 543705 cpu.go:282] Add success.
W0319 20:06:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:06:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:06:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:06:13.420126 543705 net.go:648] Add success.
I0319 20:06:13.423107 543705 net.go:770] primary dev: ETH0
I0319 20:06:13.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:06:13.423136 543705 net.go:698] Add success.
I0319 20:06:13.545766 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a23c5f2d-19a5-4ac1-97b6-d5b076013ba9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:06:13.545799 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:06:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:06:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:06:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 20:06:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:06:14.456592 543705 disk_worker.go:494] system disk:vda1
I0319 20:06:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:06:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:06:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:06:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:06:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:06:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:06:20.744362 543705 disk_info.go:125] begin check local disk info of client
I0319 20:06:20.746748 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:06:20.746755 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053a480 0xc00053a4c0]
E0319 20:06:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:23.409775 543705 memory.go:184] no items to output this cycle
I0319 20:06:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:06:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:33.409822 543705 memory.go:184] no items to output this cycle
I0319 20:06:33.409832 543705 cpu.go:275] no items to output this cycle
I0319 20:06:37.852018 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:06:37.852026 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:06:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:43.410714 543705 memory.go:191] Add success.
I0319 20:06:43.409820 543705 cpu.go:282] Add success.
I0319 20:06:43.420440 543705 net.go:648] Add success.
I0319 20:06:43.423189 543705 net.go:770] primary dev: ETH0
I0319 20:06:43.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:06:43.423231 543705 net.go:698] Add success.
I0319 20:06:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:06:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:06:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:06:53.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:06:53.409828 543705 memory.go:184] no items to output this cycle
I0319 20:06:53.409832 543705 cpu.go:275] no items to output this cycle
E0319 20:07:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:03.409781 543705 memory.go:184] no items to output this cycle
I0319 20:07:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 20:07:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:13.409828 543705 memory.go:191] Add success.
I0319 20:07:13.409832 543705 cpu.go:282] Add success.
W0319 20:07:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:07:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:07:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:07:13.420209 543705 net.go:648] Add success.
I0319 20:07:13.423330 543705 net.go:770] primary dev: ETH0
I0319 20:07:13.423345 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:07:13.423359 543705 net.go:698] Add success.
I0319 20:07:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0319 20:07:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:07:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 20:07:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:07:14.455906 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:07:14.455914 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:07:14.455920 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:07:14.456532 543705 disk_worker.go:494] system disk:vda1
I0319 20:07:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:07:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:07:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:07:16.457957 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:07:16.457965 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:07:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:07:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:07:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:07:20.748379 543705 disk_info.go:125] begin check local disk info of client
I0319 20:07:20.750846 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:07:20.750852 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002870c0 0xc000287100]
E0319 20:07:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:23.409771 543705 memory.go:184] no items to output this cycle
I0319 20:07:23.409775 543705 cpu.go:275] no items to output this cycle
E0319 20:07:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:33.409776 543705 memory.go:184] no items to output this cycle
I0319 20:07:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:07:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:43.409794 543705 memory.go:191] Add success.
I0319 20:07:43.409805 543705 cpu.go:282] Add success.
I0319 20:07:43.419971 543705 net.go:648] Add success.
I0319 20:07:43.422890 543705 net.go:770] primary dev: ETH0
I0319 20:07:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:07:43.422915 543705 net.go:698] Add success.
I0319 20:07:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:07:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:07:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:07:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:07:53.409802 543705 memory.go:184] no items to output this cycle
I0319 20:07:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 20:08:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:03.409774 543705 memory.go:184] no items to output this cycle
I0319 20:08:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 20:08:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:13.409816 543705 memory.go:191] Add success.
I0319 20:08:13.409826 543705 cpu.go:282] Add success.
W0319 20:08:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:08:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:08:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:08:13.419727 543705 net.go:648] Add success.
I0319 20:08:13.422757 543705 net.go:770] primary dev: ETH0
I0319 20:08:13.422770 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:08:13.422781 543705 net.go:698] Add success.
I0319 20:08:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:08:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:08:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 20:08:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:08:14.456564 543705 disk_worker.go:494] system disk:vda1
I0319 20:08:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:08:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:08:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:08:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:08:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:08:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:08:20.752393 543705 disk_info.go:125] begin check local disk info of client
I0319 20:08:20.754881 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:08:20.754888 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029c140 0xc00029c180]
E0319 20:08:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:23.409783 543705 memory.go:184] no items to output this cycle
I0319 20:08:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:08:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:33.409790 543705 memory.go:184] no items to output this cycle
I0319 20:08:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 20:08:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:43.409799 543705 memory.go:191] Add success.
I0319 20:08:43.409804 543705 cpu.go:282] Add success.
I0319 20:08:43.420075 543705 net.go:648] Add success.
I0319 20:08:43.423082 543705 net.go:770] primary dev: ETH0
I0319 20:08:43.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:08:43.423111 543705 net.go:698] Add success.
I0319 20:08:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:08:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:08:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:08:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:08:53.409788 543705 memory.go:184] no items to output this cycle
I0319 20:08:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 20:09:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:03.409798 543705 memory.go:184] no items to output this cycle
I0319 20:09:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 20:09:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:13.409824 543705 memory.go:191] Add success.
I0319 20:09:13.409843 543705 cpu.go:282] Add success.
W0319 20:09:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:09:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:09:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:09:13.420429 543705 net.go:648] Add success.
I0319 20:09:13.423256 543705 net.go:770] primary dev: ETH0
I0319 20:09:13.423270 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:09:13.423281 543705 net.go:698] Add success.
I0319 20:09:13.467598 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"28cdf3fe-8784-45d0-b255-a99399018a14","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:09:13.467629 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:09:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:09:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:09:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 20:09:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:09:14.456661 543705 disk_worker.go:494] system disk:vda1
I0319 20:09:14.456691 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:09:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:09:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:09:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:09:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:09:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:09:20.756420 543705 disk_info.go:125] begin check local disk info of client
I0319 20:09:20.758955 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:09:20.758962 543705 disk_info.go:196] parse disk info done, disk is : [0xc000594840 0xc0005948c0]
E0319 20:09:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:23.409773 543705 memory.go:184] no items to output this cycle
I0319 20:09:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:09:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:33.409792 543705 memory.go:184] no items to output this cycle
I0319 20:09:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 20:09:37.853725 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:09:37.853733 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:09:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:43.410642 543705 memory.go:191] Add success.
I0319 20:09:43.409808 543705 cpu.go:282] Add success.
I0319 20:09:43.420406 543705 net.go:648] Add success.
I0319 20:09:43.423055 543705 net.go:770] primary dev: ETH0
I0319 20:09:43.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:09:43.423081 543705 net.go:698] Add success.
I0319 20:09:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:09:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:09:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:09:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:09:53.409797 543705 memory.go:184] no items to output this cycle
I0319 20:09:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:10:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:03.409801 543705 memory.go:184] no items to output this cycle
I0319 20:10:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:10:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:13.409792 543705 memory.go:191] Add success.
I0319 20:10:13.409792 543705 cpu.go:282] Add success.
W0319 20:10:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:10:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:10:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:10:13.420405 543705 net.go:648] Add success.
I0319 20:10:13.423066 543705 net.go:770] primary dev: ETH0
I0319 20:10:13.423089 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:10:13.423100 543705 net.go:698] Add success.
I0319 20:10:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:10:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:10:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 20:10:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:10:14.456528 543705 disk_worker.go:494] system disk:vda1
I0319 20:10:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:10:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:10:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:10:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:10:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:10:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:10:20.760460 543705 disk_info.go:125] begin check local disk info of client
I0319 20:10:20.762925 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:10:20.762932 543705 disk_info.go:196] parse disk info done, disk is : [0xc000367ac0 0xc000367b00]
E0319 20:10:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:23.409762 543705 memory.go:184] no items to output this cycle
I0319 20:10:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:10:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:33.409772 543705 memory.go:184] no items to output this cycle
I0319 20:10:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 20:10:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:43.409819 543705 memory.go:191] Add success.
I0319 20:10:43.409829 543705 cpu.go:282] Add success.
I0319 20:10:43.419897 543705 net.go:648] Add success.
I0319 20:10:43.422777 543705 net.go:770] primary dev: ETH0
I0319 20:10:43.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:10:43.422806 543705 net.go:698] Add success.
I0319 20:10:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:10:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:10:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:10:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:10:53.409809 543705 memory.go:184] no items to output this cycle
I0319 20:10:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:11:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:03.409779 543705 memory.go:184] no items to output this cycle
I0319 20:11:03.409784 543705 cpu.go:275] no items to output this cycle
E0319 20:11:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:13.409778 543705 memory.go:191] Add success.
W0319 20:11:13.409988 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:11:13.409994 543705 cpu.go:282] Add success.
W0319 20:11:13.410002 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:11:13.410005 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:11:13.419731 543705 net.go:648] Add success.
I0319 20:11:13.422651 543705 net.go:770] primary dev: ETH0
I0319 20:11:13.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:11:13.422679 543705 net.go:698] Add success.
I0319 20:11:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:11:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:11:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 20:11:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:11:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 20:11:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:11:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:11:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:11:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:11:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:11:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:11:20.763018 543705 disk_info.go:125] begin check local disk info of client
I0319 20:11:20.765523 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:11:20.765530 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c1ac0 0xc0004c1b00]
E0319 20:11:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:23.409777 543705 cpu.go:275] no items to output this cycle
I0319 20:11:23.409787 543705 memory.go:184] no items to output this cycle
E0319 20:11:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:33.409775 543705 memory.go:184] no items to output this cycle
I0319 20:11:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 20:11:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:43.409805 543705 memory.go:191] Add success.
I0319 20:11:43.409814 543705 cpu.go:282] Add success.
I0319 20:11:43.419994 543705 net.go:648] Add success.
I0319 20:11:43.422667 543705 net.go:770] primary dev: ETH0
I0319 20:11:43.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:11:43.422692 543705 net.go:698] Add success.
I0319 20:11:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:11:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:11:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:11:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:11:53.409805 543705 memory.go:184] no items to output this cycle
I0319 20:11:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:12:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:03.409783 543705 cpu.go:275] no items to output this cycle
I0319 20:12:03.409792 543705 memory.go:184] no items to output this cycle
E0319 20:12:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:13.409890 543705 cpu.go:282] Add success.
I0319 20:12:13.409907 543705 memory.go:191] Add success.
W0319 20:12:13.409936 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:12:13.409958 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:12:13.409963 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:12:13.419729 543705 net.go:648] Add success.
I0319 20:12:13.422520 543705 net.go:770] primary dev: ETH0
I0319 20:12:13.422532 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:12:13.422546 543705 net.go:698] Add success.
I0319 20:12:13.469063 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"43d8c194-ead7-46bd-ad49-c7819c8fa946","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:12:13.469091 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 20:12:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:12:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 20:12:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:12:14.455998 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:12:14.456007 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:12:14.456012 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:12:14.456447 543705 disk_worker.go:494] system disk:vda1
I0319 20:12:14.456475 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:12:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:12:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:12:16.457931 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:12:16.457931 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:12:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:12:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:12:16.472330 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:12:20.765674 543705 disk_info.go:125] begin check local disk info of client
I0319 20:12:20.768081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:12:20.768087 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b1400 0xc0003b1440]
E0319 20:12:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:23.409791 543705 memory.go:184] no items to output this cycle
I0319 20:12:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:12:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:33.409778 543705 memory.go:184] no items to output this cycle
I0319 20:12:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 20:12:37.855970 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:12:37.855977 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:12:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:43.410655 543705 memory.go:191] Add success.
I0319 20:12:43.409825 543705 cpu.go:282] Add success.
I0319 20:12:43.420362 543705 net.go:648] Add success.
I0319 20:12:43.422931 543705 net.go:770] primary dev: ETH0
I0319 20:12:43.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:12:43.422966 543705 net.go:698] Add success.
I0319 20:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:12:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:12:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:12:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:12:53.409789 543705 memory.go:184] no items to output this cycle
I0319 20:12:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 20:13:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:03.409799 543705 memory.go:184] no items to output this cycle
I0319 20:13:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 20:13:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:13.409820 543705 memory.go:191] Add success.
I0319 20:13:13.409828 543705 cpu.go:282] Add success.
W0319 20:13:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:13:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:13:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:13:13.420145 543705 net.go:648] Add success.
I0319 20:13:13.423130 543705 net.go:770] primary dev: ETH0
I0319 20:13:13.423145 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:13:13.423159 543705 net.go:698] Add success.
I0319 20:13:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:13:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:13:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0319 20:13:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:13:14.456637 543705 disk_worker.go:494] system disk:vda1
I0319 20:13:14.456667 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:13:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:13:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:13:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:13:20.768175 543705 disk_info.go:125] begin check local disk info of client
I0319 20:13:20.770677 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:13:20.770683 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abcc0 0xc0001abd00]
E0319 20:13:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:23.409799 543705 memory.go:184] no items to output this cycle
I0319 20:13:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:13:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:33.409786 543705 memory.go:184] no items to output this cycle
I0319 20:13:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:13:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:43.409814 543705 memory.go:191] Add success.
I0319 20:13:43.409826 543705 cpu.go:282] Add success.
I0319 20:13:43.419916 543705 net.go:648] Add success.
I0319 20:13:43.422875 543705 net.go:770] primary dev: ETH0
I0319 20:13:43.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:13:43.422904 543705 net.go:698] Add success.
I0319 20:13:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:13:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:13:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:13:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:13:53.409777 543705 cpu.go:275] no items to output this cycle
I0319 20:13:53.409781 543705 memory.go:184] no items to output this cycle
E0319 20:14:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:14:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:14:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:13.409790 543705 memory.go:191] Add success.
I0319 20:14:13.409798 543705 cpu.go:282] Add success.
W0319 20:14:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:14:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:14:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:14:13.420222 543705 net.go:648] Add success.
I0319 20:14:13.423118 543705 net.go:770] primary dev: ETH0
I0319 20:14:13.423132 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:14:13.423144 543705 net.go:698] Add success.
I0319 20:14:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:14:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:14:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 20:14:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:14:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 20:14:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:14:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:14:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:14:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:14:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:14:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:14:20.772500 543705 disk_info.go:125] begin check local disk info of client
I0319 20:14:20.774994 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:14:20.775002 543705 disk_info.go:196] parse disk info done, disk is : [0xc000307bc0 0xc000307c00]
E0319 20:14:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:23.409757 543705 memory.go:184] no items to output this cycle
I0319 20:14:23.409791 543705 cpu.go:275] no items to output this cycle
E0319 20:14:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:33.409804 543705 memory.go:184] no items to output this cycle
I0319 20:14:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:14:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:43.409780 543705 memory.go:191] Add success.
I0319 20:14:43.409816 543705 cpu.go:282] Add success.
I0319 20:14:43.420006 543705 net.go:648] Add success.
I0319 20:14:43.423264 543705 net.go:770] primary dev: ETH0
I0319 20:14:43.423277 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:14:43.423290 543705 net.go:698] Add success.
I0319 20:14:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:14:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:14:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:14:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:14:53.409815 543705 memory.go:184] no items to output this cycle
I0319 20:14:53.409819 543705 cpu.go:275] no items to output this cycle
E0319 20:15:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:03.409784 543705 memory.go:184] no items to output this cycle
I0319 20:15:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 20:15:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:13.409795 543705 memory.go:191] Add success.
I0319 20:15:13.409795 543705 cpu.go:282] Add success.
W0319 20:15:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:15:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:15:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:15:13.420165 543705 net.go:648] Add success.
I0319 20:15:13.422795 543705 net.go:770] primary dev: ETH0
I0319 20:15:13.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:15:13.422825 543705 net.go:698] Add success.
I0319 20:15:13.499598 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"11f8b70a-6038-4def-bd24-18a2c32f4198","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:15:13.499632 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:15:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:15:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:15:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 20:15:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:15:14.456673 543705 disk_worker.go:494] system disk:vda1
I0319 20:15:14.456702 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:15:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:15:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:15:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:15:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:15:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:15:20.775084 543705 disk_info.go:125] begin check local disk info of client
I0319 20:15:20.777587 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:15:20.777593 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bec0 0xc00007bf00]
E0319 20:15:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:23.409768 543705 memory.go:184] no items to output this cycle
I0319 20:15:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 20:15:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:33.409774 543705 memory.go:184] no items to output this cycle
I0319 20:15:33.409798 543705 cpu.go:275] no items to output this cycle
I0319 20:15:37.857739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:15:37.857745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:15:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:43.410588 543705 memory.go:191] Add success.
I0319 20:15:43.409801 543705 cpu.go:282] Add success.
I0319 20:15:43.420299 543705 net.go:648] Add success.
I0319 20:15:43.422826 543705 net.go:770] primary dev: ETH0
I0319 20:15:43.422842 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:15:43.422856 543705 net.go:698] Add success.
I0319 20:15:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:15:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:15:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:15:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:15:53.409896 543705 memory.go:184] no items to output this cycle
I0319 20:15:53.409925 543705 cpu.go:275] no items to output this cycle
E0319 20:16:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:03.409803 543705 memory.go:184] no items to output this cycle
I0319 20:16:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 20:16:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:13.409789 543705 memory.go:191] Add success.
W0319 20:16:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:16:13.409819 543705 cpu.go:282] Add success.
W0319 20:16:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:16:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:16:13.420174 543705 net.go:648] Add success.
I0319 20:16:13.422818 543705 net.go:770] primary dev: ETH0
I0319 20:16:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:16:13.422843 543705 net.go:698] Add success.
I0319 20:16:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:16:14.455092 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:16:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 20:16:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:16:14.456561 543705 disk_worker.go:494] system disk:vda1
I0319 20:16:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:16:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:16:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:16:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:16:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:16:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:16:20.777685 543705 disk_info.go:125] begin check local disk info of client
I0319 20:16:20.780148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:16:20.780154 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b580 0xc00007b5c0]
E0319 20:16:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:23.409800 543705 memory.go:184] no items to output this cycle
I0319 20:16:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:16:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:33.409789 543705 memory.go:184] no items to output this cycle
I0319 20:16:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 20:16:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:43.409793 543705 memory.go:191] Add success.
I0319 20:16:43.409810 543705 cpu.go:282] Add success.
I0319 20:16:43.420073 543705 net.go:648] Add success.
I0319 20:16:43.422721 543705 net.go:770] primary dev: ETH0
I0319 20:16:43.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:16:43.422745 543705 net.go:698] Add success.
I0319 20:16:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:16:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:16:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:16:53.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:16:53.409832 543705 memory.go:184] no items to output this cycle
I0319 20:16:53.409842 543705 cpu.go:275] no items to output this cycle
E0319 20:17:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:17:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:17:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:13.409819 543705 memory.go:191] Add success.
I0319 20:17:13.409824 543705 cpu.go:282] Add success.
W0319 20:17:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:17:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:17:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:17:13.420089 543705 net.go:648] Add success.
I0319 20:17:13.422965 543705 net.go:770] primary dev: ETH0
I0319 20:17:13.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:17:13.422994 543705 net.go:698] Add success.
I0319 20:17:13.453669 543705 event_worker.go:152] Polling the log file for events...
W0319 20:17:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:17:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 20:17:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:17:14.456927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:17:14.456937 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:17:14.456942 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:17:14.456986 543705 disk_worker.go:494] system disk:vda1
I0319 20:17:14.457017 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:17:15.456861 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:17:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:17:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:17:16.457990 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:17:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:17:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:17:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:17:20.781538 543705 disk_info.go:125] begin check local disk info of client
I0319 20:17:20.783972 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:17:20.783979 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364280 0xc0003642c0]
E0319 20:17:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:23.409792 543705 memory.go:184] no items to output this cycle
I0319 20:17:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:17:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:33.409794 543705 memory.go:184] no items to output this cycle
I0319 20:17:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 20:17:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:43.409806 543705 memory.go:191] Add success.
I0319 20:17:43.409806 543705 cpu.go:282] Add success.
I0319 20:17:43.420033 543705 net.go:648] Add success.
I0319 20:17:43.422925 543705 net.go:770] primary dev: ETH0
I0319 20:17:43.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:17:43.422949 543705 net.go:698] Add success.
I0319 20:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:17:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:17:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:17:53.410250 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:17:53.410268 543705 memory.go:184] no items to output this cycle
I0319 20:17:53.410281 543705 cpu.go:275] no items to output this cycle
E0319 20:18:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:03.409777 543705 memory.go:184] no items to output this cycle
I0319 20:18:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:18:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:13.409810 543705 memory.go:191] Add success.
I0319 20:18:13.409819 543705 cpu.go:282] Add success.
W0319 20:18:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:18:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:18:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:18:13.420138 543705 net.go:648] Add success.
I0319 20:18:13.422866 543705 net.go:770] primary dev: ETH0
I0319 20:18:13.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:18:13.422893 543705 net.go:698] Add success.
I0319 20:18:13.543210 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"feeac589-25f5-452e-a117-f97809e3f46f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:18:13.543245 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:18:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:18:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:18:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0319 20:18:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:18:14.456663 543705 disk_worker.go:494] system disk:vda1
I0319 20:18:14.456692 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:18:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:18:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:18:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:18:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:18:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:18:20.784063 543705 disk_info.go:125] begin check local disk info of client
I0319 20:18:20.786575 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:18:20.786582 543705 disk_info.go:196] parse disk info done, disk is : [0xc000397940 0xc000397980]
E0319 20:18:23.410100 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:23.410121 543705 memory.go:184] no items to output this cycle
I0319 20:18:23.410134 543705 cpu.go:275] no items to output this cycle
E0319 20:18:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:33.409794 543705 memory.go:184] no items to output this cycle
I0319 20:18:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 20:18:37.859993 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:18:37.860000 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:18:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:43.410775 543705 memory.go:191] Add success.
I0319 20:18:43.409826 543705 cpu.go:282] Add success.
I0319 20:18:43.420891 543705 net.go:648] Add success.
I0319 20:18:43.423924 543705 net.go:770] primary dev: ETH0
I0319 20:18:43.423938 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:18:43.423950 543705 net.go:698] Add success.
I0319 20:18:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:18:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:18:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:18:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:18:53.409786 543705 memory.go:184] no items to output this cycle
I0319 20:18:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:19:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:03.409783 543705 cpu.go:275] no items to output this cycle
I0319 20:19:03.409791 543705 memory.go:184] no items to output this cycle
E0319 20:19:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:13.409814 543705 memory.go:191] Add success.
I0319 20:19:13.409819 543705 cpu.go:282] Add success.
W0319 20:19:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:19:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:19:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:19:13.420217 543705 net.go:648] Add success.
I0319 20:19:13.422980 543705 net.go:770] primary dev: ETH0
I0319 20:19:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:19:13.423007 543705 net.go:698] Add success.
I0319 20:19:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:19:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:19:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 20:19:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:19:14.456502 543705 disk_worker.go:494] system disk:vda1
I0319 20:19:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:19:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:19:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:19:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:19:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:19:20.787570 543705 disk_info.go:125] begin check local disk info of client
I0319 20:19:20.790031 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:19:20.790037 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047d700 0xc00047d740]
E0319 20:19:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:23.409767 543705 memory.go:184] no items to output this cycle
I0319 20:19:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 20:19:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:33.409812 543705 memory.go:184] no items to output this cycle
I0319 20:19:33.409823 543705 cpu.go:275] no items to output this cycle
E0319 20:19:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:43.409788 543705 memory.go:191] Add success.
I0319 20:19:43.409811 543705 cpu.go:282] Add success.
I0319 20:19:43.419956 543705 net.go:648] Add success.
I0319 20:19:43.422529 543705 net.go:770] primary dev: ETH0
I0319 20:19:43.422543 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:19:43.422555 543705 net.go:698] Add success.
I0319 20:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:19:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:19:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:19:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:19:53.409767 543705 memory.go:184] no items to output this cycle
I0319 20:19:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 20:20:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:03.409780 543705 memory.go:184] no items to output this cycle
I0319 20:20:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:20:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:13.409811 543705 memory.go:191] Add success.
I0319 20:20:13.409818 543705 cpu.go:282] Add success.
W0319 20:20:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:20:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:20:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:20:13.420163 543705 net.go:648] Add success.
I0319 20:20:13.423292 543705 net.go:770] primary dev: ETH0
I0319 20:20:13.423306 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:20:13.423318 543705 net.go:698] Add success.
I0319 20:20:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:20:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:20:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 20:20:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:20:14.456502 543705 disk_worker.go:494] system disk:vda1
I0319 20:20:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:20:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:20:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:20:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:20:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:20:20.790120 543705 disk_info.go:125] begin check local disk info of client
I0319 20:20:20.792826 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:20:20.792834 543705 disk_info.go:196] parse disk info done, disk is : [0xc000343c80 0xc000343cc0]
E0319 20:20:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:23.409764 543705 memory.go:184] no items to output this cycle
I0319 20:20:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 20:20:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:33.409925 543705 cpu.go:275] no items to output this cycle
I0319 20:20:33.409952 543705 memory.go:184] no items to output this cycle
E0319 20:20:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:43.409811 543705 memory.go:191] Add success.
I0319 20:20:43.409825 543705 cpu.go:282] Add success.
I0319 20:20:43.419979 543705 net.go:648] Add success.
I0319 20:20:43.422656 543705 net.go:770] primary dev: ETH0
I0319 20:20:43.422668 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:20:43.422681 543705 net.go:698] Add success.
I0319 20:20:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:20:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:20:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:20:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:20:53.409798 543705 memory.go:184] no items to output this cycle
I0319 20:20:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:21:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:03.409800 543705 memory.go:184] no items to output this cycle
I0319 20:21:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 20:21:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:13.409777 543705 memory.go:191] Add success.
W0319 20:21:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:21:13.409809 543705 cpu.go:282] Add success.
W0319 20:21:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:21:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:21:13.420337 543705 net.go:648] Add success.
I0319 20:21:13.423279 543705 net.go:770] primary dev: ETH0
I0319 20:21:13.423293 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:21:13.423306 543705 net.go:698] Add success.
I0319 20:21:13.572278 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dbf7184a-6820-4adc-8e39-c79c67f61050","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:21:13.572313 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:21:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:21:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:21:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0319 20:21:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:21:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 20:21:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:21:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:21:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:21:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:21:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:21:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:21:20.793675 543705 disk_info.go:125] begin check local disk info of client
I0319 20:21:20.796117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:21:20.796123 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027a2c0 0xc00027a300]
E0319 20:21:23.410362 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:23.410382 543705 memory.go:184] no items to output this cycle
I0319 20:21:23.410393 543705 cpu.go:275] no items to output this cycle
E0319 20:21:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:33.409810 543705 memory.go:184] no items to output this cycle
I0319 20:21:33.409821 543705 cpu.go:275] no items to output this cycle
I0319 20:21:37.861738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:21:37.861745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:21:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:43.410605 543705 memory.go:191] Add success.
I0319 20:21:43.409802 543705 cpu.go:282] Add success.
I0319 20:21:43.420319 543705 net.go:648] Add success.
I0319 20:21:43.423227 543705 net.go:770] primary dev: ETH0
I0319 20:21:43.423245 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:21:43.423265 543705 net.go:698] Add success.
I0319 20:21:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:21:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:21:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:21:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:21:53.409782 543705 memory.go:184] no items to output this cycle
I0319 20:21:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 20:22:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:03.409781 543705 memory.go:184] no items to output this cycle
I0319 20:22:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:22:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:13.409807 543705 memory.go:191] Add success.
I0319 20:22:13.409818 543705 cpu.go:282] Add success.
W0319 20:22:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:22:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:22:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:22:13.420077 543705 net.go:648] Add success.
I0319 20:22:13.422917 543705 net.go:770] primary dev: ETH0
I0319 20:22:13.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:22:13.422946 543705 net.go:698] Add success.
W0319 20:22:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:22:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 20:22:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:22:14.455869 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:22:14.455877 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:22:14.455883 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:22:14.456542 543705 disk_worker.go:494] system disk:vda1
I0319 20:22:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:22:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:22:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:22:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:22:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:22:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:22:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:22:16.472334 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:22:20.796206 543705 disk_info.go:125] begin check local disk info of client
I0319 20:22:20.798659 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:22:20.798665 543705 disk_info.go:196] parse disk info done, disk is : [0xc000460a80 0xc000460ac0]
E0319 20:22:23.409902 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:23.409916 543705 cpu.go:275] no items to output this cycle
I0319 20:22:23.409931 543705 memory.go:184] no items to output this cycle
E0319 20:22:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:33.409784 543705 memory.go:184] no items to output this cycle
I0319 20:22:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:22:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:43.409790 543705 memory.go:191] Add success.
I0319 20:22:43.409808 543705 cpu.go:282] Add success.
I0319 20:22:43.420040 543705 net.go:648] Add success.
I0319 20:22:43.422834 543705 net.go:770] primary dev: ETH0
I0319 20:22:43.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:22:43.422860 543705 net.go:698] Add success.
I0319 20:22:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:22:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:22:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:22:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:22:53.409790 543705 memory.go:184] no items to output this cycle
I0319 20:22:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 20:23:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:03.409771 543705 memory.go:184] no items to output this cycle
I0319 20:23:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 20:23:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:13.409813 543705 memory.go:191] Add success.
I0319 20:23:13.409821 543705 cpu.go:282] Add success.
W0319 20:23:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:23:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:23:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:23:13.420259 543705 net.go:648] Add success.
I0319 20:23:13.423066 543705 net.go:770] primary dev: ETH0
I0319 20:23:13.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:23:13.423091 543705 net.go:698] Add success.
I0319 20:23:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:23:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:23:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 20:23:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:23:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 20:23:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:23:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:23:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:23:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:23:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:23:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:23:20.798753 543705 disk_info.go:125] begin check local disk info of client
I0319 20:23:20.801281 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:23:20.801287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa100 0xc0001aa140]
E0319 20:23:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:23.409804 543705 memory.go:184] no items to output this cycle
I0319 20:23:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 20:23:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:33.409780 543705 memory.go:184] no items to output this cycle
I0319 20:23:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:23:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:43.409818 543705 memory.go:191] Add success.
I0319 20:23:43.409822 543705 cpu.go:282] Add success.
I0319 20:23:43.419921 543705 net.go:648] Add success.
I0319 20:23:43.422558 543705 net.go:770] primary dev: ETH0
I0319 20:23:43.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:23:43.422587 543705 net.go:698] Add success.
I0319 20:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:23:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:23:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:23:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:23:53.409804 543705 memory.go:184] no items to output this cycle
I0319 20:23:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 20:24:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:03.409801 543705 memory.go:184] no items to output this cycle
I0319 20:24:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 20:24:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:13.409787 543705 memory.go:191] Add success.
I0319 20:24:13.409790 543705 cpu.go:282] Add success.
W0319 20:24:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:24:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:24:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:24:13.420062 543705 net.go:648] Add success.
I0319 20:24:13.423075 543705 net.go:770] primary dev: ETH0
I0319 20:24:13.423088 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:24:13.423100 543705 net.go:698] Add success.
I0319 20:24:13.633015 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"175d99fd-f1b9-4b7e-8232-e39e35be610b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:24:13.633051 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:24:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:24:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:24:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0319 20:24:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:24:14.456497 543705 disk_worker.go:494] system disk:vda1
I0319 20:24:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:24:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:24:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:24:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:24:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:24:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:24:20.801682 543705 disk_info.go:125] begin check local disk info of client
I0319 20:24:20.804080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:24:20.804087 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c3c0 0xc00039c400]
E0319 20:24:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:23.409777 543705 memory.go:184] no items to output this cycle
I0319 20:24:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 20:24:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:33.409784 543705 memory.go:184] no items to output this cycle
I0319 20:24:33.409798 543705 cpu.go:275] no items to output this cycle
I0319 20:24:37.864013 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:24:37.864020 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:24:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:43.410644 543705 memory.go:191] Add success.
I0319 20:24:43.409791 543705 cpu.go:282] Add success.
I0319 20:24:43.420350 543705 net.go:648] Add success.
I0319 20:24:43.423010 543705 net.go:770] primary dev: ETH0
I0319 20:24:43.423026 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:24:43.423041 543705 net.go:698] Add success.
I0319 20:24:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:24:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:24:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:24:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:24:53.409812 543705 memory.go:184] no items to output this cycle
I0319 20:24:53.409826 543705 cpu.go:275] no items to output this cycle
E0319 20:25:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:03.409772 543705 memory.go:184] no items to output this cycle
I0319 20:25:03.409807 543705 cpu.go:275] no items to output this cycle
W0319 20:25:13.409723 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:25:13.409740 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:25:13.409746 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:25:13.409812 543705 cpu.go:282] Add success.
E0319 20:25:13.409823 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:13.409846 543705 memory.go:191] Add success.
I0319 20:25:13.420252 543705 net.go:648] Add success.
I0319 20:25:13.423036 543705 net.go:770] primary dev: ETH0
I0319 20:25:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:25:13.423065 543705 net.go:698] Add success.
I0319 20:25:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:25:14.455213 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:25:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0319 20:25:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:25:14.456872 543705 disk_worker.go:494] system disk:vda1
I0319 20:25:14.456906 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:25:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:25:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:25:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:25:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:25:16.472451 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:25:20.805677 543705 disk_info.go:125] begin check local disk info of client
I0319 20:25:20.808097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:25:20.808105 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c140 0xc00034c180]
E0319 20:25:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:23.409780 543705 memory.go:184] no items to output this cycle
I0319 20:25:23.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:25:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:33.409793 543705 memory.go:184] no items to output this cycle
I0319 20:25:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 20:25:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:43.409793 543705 memory.go:191] Add success.
I0319 20:25:43.409816 543705 cpu.go:282] Add success.
I0319 20:25:43.419898 543705 net.go:648] Add success.
I0319 20:25:43.422567 543705 net.go:770] primary dev: ETH0
I0319 20:25:43.422581 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:25:43.422593 543705 net.go:698] Add success.
I0319 20:25:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:25:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:25:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:25:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:25:53.409777 543705 memory.go:184] no items to output this cycle
I0319 20:25:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:26:03.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:03.409816 543705 memory.go:184] no items to output this cycle
I0319 20:26:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 20:26:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:13.409789 543705 memory.go:191] Add success.
I0319 20:26:13.409814 543705 cpu.go:282] Add success.
W0319 20:26:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:26:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:26:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:26:13.420284 543705 net.go:648] Add success.
I0319 20:26:13.422883 543705 net.go:770] primary dev: ETH0
I0319 20:26:13.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:26:13.422907 543705 net.go:698] Add success.
I0319 20:26:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:26:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:26:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0319 20:26:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:26:14.456463 543705 disk_worker.go:494] system disk:vda1
I0319 20:26:14.456509 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:26:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:26:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:26:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:26:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:26:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:26:20.809675 543705 disk_info.go:125] begin check local disk info of client
I0319 20:26:20.812156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:26:20.812163 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004da100 0xc0004da140]
E0319 20:26:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:23.409807 543705 memory.go:184] no items to output this cycle
I0319 20:26:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 20:26:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:33.409792 543705 memory.go:184] no items to output this cycle
I0319 20:26:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 20:26:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:43.409822 543705 memory.go:191] Add success.
I0319 20:26:43.409835 543705 cpu.go:282] Add success.
I0319 20:26:43.419892 543705 net.go:648] Add success.
I0319 20:26:43.422488 543705 net.go:770] primary dev: ETH0
I0319 20:26:43.422500 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:26:43.422513 543705 net.go:698] Add success.
I0319 20:26:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:26:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:26:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:26:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:26:53.409793 543705 memory.go:184] no items to output this cycle
I0319 20:26:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 20:27:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:03.409805 543705 memory.go:184] no items to output this cycle
I0319 20:27:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:27:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:13.409812 543705 memory.go:191] Add success.
I0319 20:27:13.409820 543705 cpu.go:282] Add success.
W0319 20:27:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:27:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:27:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:27:13.420106 543705 net.go:648] Add success.
I0319 20:27:13.430024 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 20:27:13.430116 543705 net.go:770] primary dev: ETH0
I0319 20:27:13.430133 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:27:13.430147 543705 net.go:698] Add success.
I0319 20:27:13.453662 543705 event_worker.go:152] Polling the log file for events...
I0319 20:27:13.469378 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4011fdef-1b16-4948-b9c1-da30e5883f5e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:27:13.469409 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 20:27:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:27:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 20:27:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:27:14.455940 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:27:14.455949 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:27:14.455955 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:27:14.456535 543705 disk_worker.go:494] system disk:vda1
I0319 20:27:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:27:15.456791 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:27:15.456800 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:27:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:27:16.457928 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:27:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:27:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:27:16.472335 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:27:20.813674 543705 disk_info.go:125] begin check local disk info of client
I0319 20:27:20.816155 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:27:20.816162 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e640 0xc00034e680]
E0319 20:27:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:23.409796 543705 memory.go:184] no items to output this cycle
I0319 20:27:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:27:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:33.409775 543705 memory.go:184] no items to output this cycle
I0319 20:27:33.409807 543705 cpu.go:275] no items to output this cycle
I0319 20:27:37.865744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:27:37.865751 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:27:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:43.410967 543705 memory.go:191] Add success.
I0319 20:27:43.409825 543705 cpu.go:282] Add success.
I0319 20:27:43.420689 543705 net.go:648] Add success.
I0319 20:27:43.423507 543705 net.go:770] primary dev: ETH0
I0319 20:27:43.423523 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:27:43.423536 543705 net.go:698] Add success.
I0319 20:27:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:27:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:27:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:27:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:27:53.409804 543705 memory.go:184] no items to output this cycle
I0319 20:27:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 20:28:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:03.409787 543705 memory.go:184] no items to output this cycle
I0319 20:28:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 20:28:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:13.409811 543705 memory.go:191] Add success.
I0319 20:28:13.409822 543705 cpu.go:282] Add success.
W0319 20:28:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:28:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:28:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:28:13.420378 543705 net.go:648] Add success.
I0319 20:28:13.423045 543705 net.go:770] primary dev: ETH0
I0319 20:28:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:28:13.423069 543705 net.go:698] Add success.
I0319 20:28:14.453952 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:28:14.455220 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:28:14.455230 543705 disk_worker.go:708] disk space is not compliant
W0319 20:28:14.455233 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:28:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 20:28:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:28:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:28:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:28:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:28:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:28:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:28:20.817676 543705 disk_info.go:125] begin check local disk info of client
I0319 20:28:20.820186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:28:20.820193 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049af40 0xc00049af80]
E0319 20:28:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:23.409767 543705 memory.go:184] no items to output this cycle
I0319 20:28:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 20:28:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:33.409777 543705 memory.go:184] no items to output this cycle
I0319 20:28:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:28:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:43.409784 543705 memory.go:191] Add success.
I0319 20:28:43.409815 543705 cpu.go:282] Add success.
I0319 20:28:43.419884 543705 net.go:648] Add success.
I0319 20:28:43.422720 543705 net.go:770] primary dev: ETH0
I0319 20:28:43.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:28:43.422757 543705 net.go:698] Add success.
I0319 20:28:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:28:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:28:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:28:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:28:53.409799 543705 cpu.go:275] no items to output this cycle
I0319 20:28:53.409803 543705 memory.go:184] no items to output this cycle
E0319 20:29:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:03.409796 543705 memory.go:184] no items to output this cycle
I0319 20:29:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 20:29:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:13.409784 543705 memory.go:191] Add success.
I0319 20:29:13.409805 543705 cpu.go:282] Add success.
W0319 20:29:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:29:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:29:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:29:13.420297 543705 net.go:648] Add success.
I0319 20:29:13.423142 543705 net.go:770] primary dev: ETH0
I0319 20:29:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:29:13.423170 543705 net.go:698] Add success.
I0319 20:29:14.454950 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:29:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:29:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 20:29:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:29:14.456623 543705 disk_worker.go:494] system disk:vda1
I0319 20:29:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:29:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:29:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:29:16.458069 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:29:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:29:16.472465 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:29:20.821673 543705 disk_info.go:125] begin check local disk info of client
I0319 20:29:20.824189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:29:20.824196 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035ef80 0xc00035efc0]
E0319 20:29:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:23.409790 543705 memory.go:184] no items to output this cycle
I0319 20:29:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 20:29:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:33.409786 543705 memory.go:184] no items to output this cycle
I0319 20:29:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 20:29:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:43.409780 543705 memory.go:191] Add success.
I0319 20:29:43.409806 543705 cpu.go:282] Add success.
I0319 20:29:43.419858 543705 net.go:648] Add success.
I0319 20:29:43.422932 543705 net.go:770] primary dev: ETH0
I0319 20:29:43.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:29:43.422957 543705 net.go:698] Add success.
I0319 20:29:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:29:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:29:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:29:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:29:53.409771 543705 memory.go:184] no items to output this cycle
I0319 20:29:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:30:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:03.409779 543705 memory.go:184] no items to output this cycle
I0319 20:30:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:30:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:13.409783 543705 memory.go:191] Add success.
I0319 20:30:13.409785 543705 cpu.go:282] Add success.
W0319 20:30:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:30:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:30:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:30:13.420241 543705 net.go:648] Add success.
I0319 20:30:13.423324 543705 net.go:770] primary dev: ETH0
I0319 20:30:13.423339 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:30:13.423352 543705 net.go:698] Add success.
I0319 20:30:13.528837 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cf6edcf2-b5d8-43ab-ba20-0558804dc274","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:30:13.528876 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:30:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:30:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:30:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 20:30:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:30:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 20:30:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:30:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:30:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:30:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:30:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:30:16.472468 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:30:20.825673 543705 disk_info.go:125] begin check local disk info of client
I0319 20:30:20.828253 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:30:20.828259 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cc940 0xc0004cc980]
E0319 20:30:23.410505 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:23.410524 543705 memory.go:184] no items to output this cycle
I0319 20:30:23.410536 543705 cpu.go:275] no items to output this cycle
E0319 20:30:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 20:30:33.409801 543705 memory.go:184] no items to output this cycle
I0319 20:30:37.868033 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:30:37.868040 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:30:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:43.410635 543705 memory.go:191] Add success.
I0319 20:30:43.409819 543705 cpu.go:282] Add success.
I0319 20:30:43.420356 543705 net.go:648] Add success.
I0319 20:30:43.423115 543705 net.go:770] primary dev: ETH0
I0319 20:30:43.423129 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:30:43.423144 543705 net.go:698] Add success.
I0319 20:30:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:30:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:30:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:30:53.410254 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:30:53.410272 543705 memory.go:184] no items to output this cycle
I0319 20:30:53.410314 543705 cpu.go:275] no items to output this cycle
E0319 20:31:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:03.409769 543705 memory.go:184] no items to output this cycle
I0319 20:31:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:31:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:13.409827 543705 memory.go:191] Add success.
I0319 20:31:13.409830 543705 cpu.go:282] Add success.
W0319 20:31:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:31:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:31:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:31:13.420298 543705 net.go:648] Add success.
I0319 20:31:13.423128 543705 net.go:770] primary dev: ETH0
I0319 20:31:13.423140 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:31:13.423151 543705 net.go:698] Add success.
I0319 20:31:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:31:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:31:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 20:31:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:31:14.456510 543705 disk_worker.go:494] system disk:vda1
I0319 20:31:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:31:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:31:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:31:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:31:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:31:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:31:20.829673 543705 disk_info.go:125] begin check local disk info of client
I0319 20:31:20.832058 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:31:20.832064 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005425c0 0xc000542600]
E0319 20:31:23.410421 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:23.410436 543705 memory.go:184] no items to output this cycle
I0319 20:31:23.410471 543705 cpu.go:275] no items to output this cycle
E0319 20:31:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:33.409775 543705 memory.go:184] no items to output this cycle
I0319 20:31:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 20:31:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:43.409780 543705 memory.go:191] Add success.
I0319 20:31:43.409811 543705 cpu.go:282] Add success.
I0319 20:31:43.419884 543705 net.go:648] Add success.
I0319 20:31:43.422746 543705 net.go:770] primary dev: ETH0
I0319 20:31:43.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:31:43.422773 543705 net.go:698] Add success.
I0319 20:31:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:31:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:31:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:31:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:31:53.409778 543705 memory.go:184] no items to output this cycle
I0319 20:31:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 20:32:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:32:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:32:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:13.409817 543705 memory.go:191] Add success.
I0319 20:32:13.409819 543705 cpu.go:282] Add success.
W0319 20:32:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:32:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:32:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:32:13.419773 543705 net.go:648] Add success.
I0319 20:32:13.422459 543705 net.go:770] primary dev: ETH0
I0319 20:32:13.422472 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:32:13.422482 543705 net.go:698] Add success.
W0319 20:32:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:32:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 20:32:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:32:14.456783 543705 disk_worker.go:494] system disk:vda1
I0319 20:32:14.456821 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:32:14.457136 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:32:14.457144 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:32:14.457148 543705 custom_config.go:64] query custom config with name: gpu
E0319 20:32:15.456848 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:32:15.456856 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:32:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:32:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:32:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:32:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:32:16.472358 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:32:20.833673 543705 disk_info.go:125] begin check local disk info of client
I0319 20:32:20.836099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:32:20.836105 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c48c0 0xc0000c4900]
E0319 20:32:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:23.409765 543705 memory.go:184] no items to output this cycle
I0319 20:32:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 20:32:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:33.409782 543705 memory.go:184] no items to output this cycle
I0319 20:32:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:32:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:43.409778 543705 memory.go:191] Add success.
I0319 20:32:43.409801 543705 cpu.go:282] Add success.
I0319 20:32:43.419852 543705 net.go:648] Add success.
I0319 20:32:43.422849 543705 net.go:770] primary dev: ETH0
I0319 20:32:43.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:32:43.422876 543705 net.go:698] Add success.
I0319 20:32:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:32:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:32:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:32:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:32:53.409805 543705 memory.go:184] no items to output this cycle
I0319 20:32:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 20:33:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:03.409781 543705 memory.go:184] no items to output this cycle
I0319 20:33:03.409786 543705 cpu.go:275] no items to output this cycle
E0319 20:33:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:13.409800 543705 memory.go:191] Add success.
I0319 20:33:13.409817 543705 cpu.go:282] Add success.
W0319 20:33:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:33:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:33:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:33:13.420322 543705 net.go:648] Add success.
I0319 20:33:13.423002 543705 net.go:770] primary dev: ETH0
I0319 20:33:13.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:33:13.423028 543705 net.go:698] Add success.
I0319 20:33:13.483475 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0cc7c0cb-b49c-4348-be69-a14da095cb30","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:33:13.483511 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:33:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:33:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:33:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 20:33:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:33:14.456627 543705 disk_worker.go:494] system disk:vda1
I0319 20:33:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:33:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:33:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:33:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:33:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:33:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:33:20.837677 543705 disk_info.go:125] begin check local disk info of client
I0319 20:33:20.840175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:33:20.840181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab600 0xc0001ab640]
E0319 20:33:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:23.409769 543705 memory.go:184] no items to output this cycle
I0319 20:33:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:33:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:33.409810 543705 memory.go:184] no items to output this cycle
I0319 20:33:33.409822 543705 cpu.go:275] no items to output this cycle
I0319 20:33:37.869736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:33:37.869742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:33:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:43.410652 543705 memory.go:191] Add success.
I0319 20:33:43.409799 543705 cpu.go:282] Add success.
I0319 20:33:43.420356 543705 net.go:648] Add success.
I0319 20:33:43.423114 543705 net.go:770] primary dev: ETH0
I0319 20:33:43.423128 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:33:43.423140 543705 net.go:698] Add success.
I0319 20:33:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:33:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:33:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:33:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:33:53.409814 543705 memory.go:184] no items to output this cycle
I0319 20:33:53.409825 543705 cpu.go:275] no items to output this cycle
E0319 20:34:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:03.409809 543705 memory.go:184] no items to output this cycle
I0319 20:34:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:34:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:13.409803 543705 memory.go:191] Add success.
I0319 20:34:13.409804 543705 cpu.go:282] Add success.
W0319 20:34:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:34:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:34:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:34:13.420166 543705 net.go:648] Add success.
I0319 20:34:13.423030 543705 net.go:770] primary dev: ETH0
I0319 20:34:13.423043 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:34:13.423055 543705 net.go:698] Add success.
I0319 20:34:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:34:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:34:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 20:34:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:34:14.456592 543705 disk_worker.go:494] system disk:vda1
I0319 20:34:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:34:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:34:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:34:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:34:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:34:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:34:20.841679 543705 disk_info.go:125] begin check local disk info of client
I0319 20:34:20.844128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:34:20.844135 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396600 0xc000396640]
E0319 20:34:23.410441 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:23.410459 543705 memory.go:184] no items to output this cycle
I0319 20:34:23.410474 543705 cpu.go:275] no items to output this cycle
E0319 20:34:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:33.409793 543705 memory.go:184] no items to output this cycle
I0319 20:34:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:34:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:43.409779 543705 memory.go:191] Add success.
I0319 20:34:43.409811 543705 cpu.go:282] Add success.
I0319 20:34:43.419918 543705 net.go:648] Add success.
I0319 20:34:43.422731 543705 net.go:770] primary dev: ETH0
I0319 20:34:43.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:34:43.422757 543705 net.go:698] Add success.
I0319 20:34:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:34:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:34:53.409815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:34:53.409822 543705 cpu.go:275] no items to output this cycle
I0319 20:34:53.409837 543705 memory.go:184] no items to output this cycle
E0319 20:35:03.409898 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:03.409906 543705 cpu.go:275] no items to output this cycle
I0319 20:35:03.409920 543705 memory.go:184] no items to output this cycle
E0319 20:35:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:13.409819 543705 memory.go:191] Add success.
I0319 20:35:13.409832 543705 cpu.go:282] Add success.
W0319 20:35:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:35:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:35:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:35:13.420137 543705 net.go:648] Add success.
I0319 20:35:13.423118 543705 net.go:770] primary dev: ETH0
I0319 20:35:13.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:35:13.423142 543705 net.go:698] Add success.
I0319 20:35:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:35:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:35:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 20:35:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:35:14.456490 543705 disk_worker.go:494] system disk:vda1
I0319 20:35:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:35:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:35:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:35:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:35:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:35:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:35:20.845671 543705 disk_info.go:125] begin check local disk info of client
I0319 20:35:20.848103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:35:20.848109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c9280 0xc0003c92c0]
E0319 20:35:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:23.409760 543705 memory.go:184] no items to output this cycle
I0319 20:35:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:35:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:33.409815 543705 memory.go:184] no items to output this cycle
I0319 20:35:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 20:35:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:43.409810 543705 memory.go:191] Add success.
I0319 20:35:43.409811 543705 cpu.go:282] Add success.
I0319 20:35:43.419892 543705 net.go:648] Add success.
I0319 20:35:43.423036 543705 net.go:770] primary dev: ETH0
I0319 20:35:43.423049 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:35:43.423062 543705 net.go:698] Add success.
I0319 20:35:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:35:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:35:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:35:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:35:53.409786 543705 memory.go:184] no items to output this cycle
I0319 20:35:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:36:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:03.409804 543705 memory.go:184] no items to output this cycle
I0319 20:36:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 20:36:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:13.409806 543705 memory.go:191] Add success.
I0319 20:36:13.409808 543705 cpu.go:282] Add success.
W0319 20:36:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:36:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:36:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:36:13.420044 543705 net.go:648] Add success.
I0319 20:36:13.422638 543705 net.go:770] primary dev: ETH0
I0319 20:36:13.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:36:13.422663 543705 net.go:698] Add success.
I0319 20:36:13.468950 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3dbf2296-79a8-43aa-8d4c-5ce0de3c34f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:36:13.468984 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:36:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:36:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:36:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 20:36:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:36:14.456584 543705 disk_worker.go:494] system disk:vda1
I0319 20:36:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:36:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:36:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:36:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:36:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:36:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:36:20.849671 543705 disk_info.go:125] begin check local disk info of client
I0319 20:36:20.852126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:36:20.852132 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d40 0xc0000c4d80]
E0319 20:36:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:23.409784 543705 memory.go:184] no items to output this cycle
I0319 20:36:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 20:36:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:33.409823 543705 memory.go:184] no items to output this cycle
I0319 20:36:33.409838 543705 cpu.go:275] no items to output this cycle
I0319 20:36:37.872050 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:36:37.872057 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:36:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:43.410885 543705 memory.go:191] Add success.
I0319 20:36:43.409804 543705 cpu.go:282] Add success.
I0319 20:36:43.420614 543705 net.go:648] Add success.
I0319 20:36:43.423393 543705 net.go:770] primary dev: ETH0
I0319 20:36:43.423406 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:36:43.423420 543705 net.go:698] Add success.
I0319 20:36:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:36:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:36:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 20:36:53.409945 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:36:53.409959 543705 memory.go:184] no items to output this cycle
E0319 20:37:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:03.409790 543705 cpu.go:275] no items to output this cycle
I0319 20:37:03.409806 543705 memory.go:184] no items to output this cycle
E0319 20:37:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:13.409794 543705 memory.go:191] Add success.
I0319 20:37:13.409804 543705 cpu.go:282] Add success.
W0319 20:37:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:37:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:37:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:37:13.420114 543705 net.go:648] Add success.
I0319 20:37:13.422805 543705 net.go:770] primary dev: ETH0
I0319 20:37:13.422828 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:37:13.422840 543705 net.go:698] Add success.
I0319 20:37:13.453377 543705 event_worker.go:152] Polling the log file for events...
W0319 20:37:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:37:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 20:37:14.455185 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:37:14.455880 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:37:14.455889 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:37:14.455895 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:37:14.456532 543705 disk_worker.go:494] system disk:vda1
I0319 20:37:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:37:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:37:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:37:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:37:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:37:16.457971 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:37:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:37:16.472325 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:37:20.853681 543705 disk_info.go:125] begin check local disk info of client
I0319 20:37:20.856053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:37:20.856059 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bb40 0xc00007bb80]
E0319 20:37:23.410385 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:23.410388 543705 cpu.go:275] no items to output this cycle
I0319 20:37:23.410398 543705 memory.go:184] no items to output this cycle
E0319 20:37:33.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:33.409828 543705 memory.go:184] no items to output this cycle
I0319 20:37:33.409839 543705 cpu.go:275] no items to output this cycle
E0319 20:37:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:43.409793 543705 memory.go:191] Add success.
I0319 20:37:43.409826 543705 cpu.go:282] Add success.
I0319 20:37:43.419988 543705 net.go:648] Add success.
I0319 20:37:43.423109 543705 net.go:770] primary dev: ETH0
I0319 20:37:43.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:37:43.423134 543705 net.go:698] Add success.
I0319 20:37:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:37:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:37:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:37:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:37:53.409805 543705 memory.go:184] no items to output this cycle
I0319 20:37:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:38:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:03.409784 543705 memory.go:184] no items to output this cycle
I0319 20:38:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 20:38:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:13.409798 543705 cpu.go:282] Add success.
I0319 20:38:13.409802 543705 memory.go:191] Add success.
W0319 20:38:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:38:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:38:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:38:13.420163 543705 net.go:648] Add success.
I0319 20:38:13.422996 543705 net.go:770] primary dev: ETH0
I0319 20:38:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:38:13.423022 543705 net.go:698] Add success.
I0319 20:38:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:38:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:38:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 20:38:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:38:14.456557 543705 disk_worker.go:494] system disk:vda1
I0319 20:38:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:38:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:38:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:38:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:38:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:38:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:38:20.857680 543705 disk_info.go:125] begin check local disk info of client
I0319 20:38:20.860185 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:38:20.860191 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bed00 0xc0003bed40]
E0319 20:38:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:23.409800 543705 memory.go:184] no items to output this cycle
I0319 20:38:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:38:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:33.409788 543705 memory.go:184] no items to output this cycle
I0319 20:38:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 20:38:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:43.409788 543705 memory.go:191] Add success.
I0319 20:38:43.409790 543705 cpu.go:282] Add success.
I0319 20:38:43.420125 543705 net.go:648] Add success.
I0319 20:38:43.422973 543705 net.go:770] primary dev: ETH0
I0319 20:38:43.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:38:43.423002 543705 net.go:698] Add success.
I0319 20:38:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:38:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:38:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:38:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:38:53.409907 543705 memory.go:184] no items to output this cycle
I0319 20:38:53.409973 543705 cpu.go:275] no items to output this cycle
E0319 20:39:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:03.409812 543705 memory.go:184] no items to output this cycle
I0319 20:39:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 20:39:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:13.409813 543705 memory.go:191] Add success.
I0319 20:39:13.409820 543705 cpu.go:282] Add success.
W0319 20:39:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:39:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:39:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:39:13.420148 543705 net.go:648] Add success.
I0319 20:39:13.422994 543705 net.go:770] primary dev: ETH0
I0319 20:39:13.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:39:13.423019 543705 net.go:698] Add success.
I0319 20:39:13.479006 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7c335162-d067-45c9-a42c-3a1702aaea71","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:39:13.479041 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:39:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:39:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:39:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 20:39:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:39:14.456735 543705 disk_worker.go:494] system disk:vda1
I0319 20:39:14.456767 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:39:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:39:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:39:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:39:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:39:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:39:20.861673 543705 disk_info.go:125] begin check local disk info of client
I0319 20:39:20.864243 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:39:20.864249 543705 disk_info.go:196] parse disk info done, disk is : [0xc000381f00 0xc000381f40]
E0319 20:39:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:23.409781 543705 cpu.go:275] no items to output this cycle
I0319 20:39:23.409783 543705 memory.go:184] no items to output this cycle
E0319 20:39:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:33.409818 543705 memory.go:184] no items to output this cycle
I0319 20:39:33.409829 543705 cpu.go:275] no items to output this cycle
I0319 20:39:37.873731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:39:37.873739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:39:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:43.410790 543705 memory.go:191] Add success.
I0319 20:39:43.409807 543705 cpu.go:282] Add success.
I0319 20:39:43.419719 543705 net.go:648] Add success.
I0319 20:39:43.422601 543705 net.go:770] primary dev: ETH0
I0319 20:39:43.422615 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:39:43.422630 543705 net.go:698] Add success.
I0319 20:39:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:39:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:39:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:39:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:39:53.409786 543705 memory.go:184] no items to output this cycle
I0319 20:39:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 20:40:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:03.409779 543705 memory.go:184] no items to output this cycle
I0319 20:40:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:40:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:13.409820 543705 memory.go:191] Add success.
I0319 20:40:13.409821 543705 cpu.go:282] Add success.
W0319 20:40:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:40:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:40:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:40:13.420160 543705 net.go:648] Add success.
I0319 20:40:13.423437 543705 net.go:770] primary dev: ETH0
I0319 20:40:13.423450 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:40:13.423461 543705 net.go:698] Add success.
I0319 20:40:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:40:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:40:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0319 20:40:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:40:14.456614 543705 disk_worker.go:494] system disk:vda1
I0319 20:40:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:40:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:40:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:40:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:40:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:40:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:40:20.865679 543705 disk_info.go:125] begin check local disk info of client
I0319 20:40:20.868110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:40:20.868116 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376400 0xc000376440]
E0319 20:40:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:23.409794 543705 memory.go:184] no items to output this cycle
I0319 20:40:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:40:33.409895 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:33.409922 543705 memory.go:184] no items to output this cycle
I0319 20:40:33.410023 543705 cpu.go:275] no items to output this cycle
E0319 20:40:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:43.409791 543705 memory.go:191] Add success.
I0319 20:40:43.409806 543705 cpu.go:282] Add success.
I0319 20:40:43.419967 543705 net.go:648] Add success.
I0319 20:40:43.422879 543705 net.go:770] primary dev: ETH0
I0319 20:40:43.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:40:43.422905 543705 net.go:698] Add success.
I0319 20:40:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:40:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:40:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:40:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:40:53.409766 543705 memory.go:184] no items to output this cycle
I0319 20:40:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:41:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:03.409806 543705 memory.go:184] no items to output this cycle
I0319 20:41:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 20:41:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:13.409784 543705 memory.go:191] Add success.
W0319 20:41:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:41:13.409811 543705 cpu.go:282] Add success.
W0319 20:41:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:41:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:41:13.420075 543705 net.go:648] Add success.
I0319 20:41:13.422991 543705 net.go:770] primary dev: ETH0
I0319 20:41:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:41:13.423016 543705 net.go:698] Add success.
I0319 20:41:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:41:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:41:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 20:41:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:41:14.456505 543705 disk_worker.go:494] system disk:vda1
I0319 20:41:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:41:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:41:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:41:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:41:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:41:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:41:20.869670 543705 disk_info.go:125] begin check local disk info of client
I0319 20:41:20.872135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:41:20.872141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be040 0xc0002be080]
E0319 20:41:23.409921 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:23.409953 543705 memory.go:184] no items to output this cycle
I0319 20:41:23.409967 543705 cpu.go:275] no items to output this cycle
E0319 20:41:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:33.409776 543705 memory.go:184] no items to output this cycle
I0319 20:41:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 20:41:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:43.409816 543705 memory.go:191] Add success.
I0319 20:41:43.409816 543705 cpu.go:282] Add success.
I0319 20:41:43.419971 543705 net.go:648] Add success.
I0319 20:41:43.422644 543705 net.go:770] primary dev: ETH0
I0319 20:41:43.422656 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:41:43.422670 543705 net.go:698] Add success.
I0319 20:41:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:41:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:41:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:41:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:41:53.409774 543705 memory.go:184] no items to output this cycle
I0319 20:41:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 20:42:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:42:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 20:42:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:13.409801 543705 memory.go:191] Add success.
I0319 20:42:13.409808 543705 cpu.go:282] Add success.
W0319 20:42:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:42:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:42:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:42:13.420144 543705 net.go:648] Add success.
I0319 20:42:13.423565 543705 net.go:770] primary dev: ETH0
I0319 20:42:13.423578 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:42:13.423591 543705 net.go:698] Add success.
I0319 20:42:13.463528 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ae08531a-fcd7-44a5-9bd0-f7b230e10e4f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:42:13.463560 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 20:42:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:42:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 20:42:14.455214 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:42:14.455918 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:42:14.455928 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:42:14.455934 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:42:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 20:42:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:42:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:42:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:42:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:42:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:42:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:42:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:42:16.472336 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:42:20.873675 543705 disk_info.go:125] begin check local disk info of client
I0319 20:42:20.876083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:42:20.876089 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a740 0xc00007a780]
E0319 20:42:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:23.409783 543705 memory.go:184] no items to output this cycle
I0319 20:42:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 20:42:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:33.409777 543705 memory.go:184] no items to output this cycle
I0319 20:42:33.409815 543705 cpu.go:275] no items to output this cycle
I0319 20:42:37.876068 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:42:37.876074 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:42:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:43.410646 543705 memory.go:191] Add success.
I0319 20:42:43.409804 543705 cpu.go:282] Add success.
I0319 20:42:43.420406 543705 net.go:648] Add success.
I0319 20:42:43.422949 543705 net.go:770] primary dev: ETH0
I0319 20:42:43.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:42:43.422974 543705 net.go:698] Add success.
I0319 20:42:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:42:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:42:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:42:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:42:53.409780 543705 memory.go:184] no items to output this cycle
I0319 20:42:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 20:43:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:03.409818 543705 memory.go:184] no items to output this cycle
I0319 20:43:03.409833 543705 cpu.go:275] no items to output this cycle
E0319 20:43:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:13.409787 543705 memory.go:191] Add success.
I0319 20:43:13.409788 543705 cpu.go:282] Add success.
W0319 20:43:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:43:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:43:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:43:13.420632 543705 net.go:648] Add success.
I0319 20:43:13.423603 543705 net.go:770] primary dev: ETH0
I0319 20:43:13.423619 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:43:13.423633 543705 net.go:698] Add success.
I0319 20:43:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:43:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:43:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 20:43:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:43:14.458980 543705 disk_worker.go:494] system disk:vda1
I0319 20:43:14.459009 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:43:15.456009 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:43:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:43:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:43:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:43:20.877676 543705 disk_info.go:125] begin check local disk info of client
I0319 20:43:20.880165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:43:20.880172 543705 disk_info.go:196] parse disk info done, disk is : [0xc000289140 0xc000289180]
E0319 20:43:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:23.409798 543705 memory.go:184] no items to output this cycle
I0319 20:43:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 20:43:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:33.409806 543705 memory.go:184] no items to output this cycle
I0319 20:43:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:43:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:43.409787 543705 memory.go:191] Add success.
I0319 20:43:43.409807 543705 cpu.go:282] Add success.
I0319 20:43:43.419881 543705 net.go:648] Add success.
I0319 20:43:43.422801 543705 net.go:770] primary dev: ETH0
I0319 20:43:43.422813 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:43:43.422842 543705 net.go:698] Add success.
I0319 20:43:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:43:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:43:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:43:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:43:53.409805 543705 memory.go:184] no items to output this cycle
I0319 20:43:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 20:44:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:44:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 20:44:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:13.409789 543705 memory.go:191] Add success.
I0319 20:44:13.409789 543705 cpu.go:282] Add success.
W0319 20:44:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:44:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:44:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:44:13.420115 543705 net.go:648] Add success.
I0319 20:44:13.422925 543705 net.go:770] primary dev: ETH0
I0319 20:44:13.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:44:13.423114 543705 net.go:698] Add success.
I0319 20:44:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:44:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:44:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 20:44:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:44:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 20:44:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:44:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:44:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:44:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:44:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:44:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:44:20.881671 543705 disk_info.go:125] begin check local disk info of client
I0319 20:44:20.884198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:44:20.884205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f1400 0xc0003f1440]
E0319 20:44:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:23.409779 543705 memory.go:184] no items to output this cycle
I0319 20:44:23.409783 543705 cpu.go:275] no items to output this cycle
E0319 20:44:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:33.409785 543705 memory.go:184] no items to output this cycle
I0319 20:44:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:44:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:43.409796 543705 cpu.go:282] Add success.
I0319 20:44:43.409802 543705 memory.go:191] Add success.
I0319 20:44:43.419828 543705 net.go:648] Add success.
I0319 20:44:43.422481 543705 net.go:770] primary dev: ETH0
I0319 20:44:43.422494 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:44:43.422506 543705 net.go:698] Add success.
I0319 20:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:44:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:44:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:44:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:44:53.409796 543705 memory.go:184] no items to output this cycle
I0319 20:44:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:45:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:03.409809 543705 memory.go:184] no items to output this cycle
I0319 20:45:03.409829 543705 cpu.go:275] no items to output this cycle
E0319 20:45:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:13.409782 543705 memory.go:191] Add success.
W0319 20:45:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:45:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:45:13.409819 543705 cpu.go:282] Add success.
I0319 20:45:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:45:13.420259 543705 net.go:648] Add success.
I0319 20:45:13.423209 543705 net.go:770] primary dev: ETH0
I0319 20:45:13.423221 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:45:13.423233 543705 net.go:698] Add success.
I0319 20:45:13.469737 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b3afa715-c1f9-4197-9172-8577a26c655a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:45:13.469779 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:45:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:45:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:45:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 20:45:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:45:14.457278 543705 disk_worker.go:494] system disk:vda1
I0319 20:45:14.457384 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:45:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:45:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:45:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:45:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:45:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:45:20.885675 543705 disk_info.go:125] begin check local disk info of client
I0319 20:45:20.888129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:45:20.888136 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386cc0 0xc000386d00]
E0319 20:45:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:23.409765 543705 memory.go:184] no items to output this cycle
I0319 20:45:23.409817 543705 cpu.go:275] no items to output this cycle
E0319 20:45:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:33.409815 543705 memory.go:184] no items to output this cycle
I0319 20:45:33.409826 543705 cpu.go:275] no items to output this cycle
I0319 20:45:37.877739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:45:37.877746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:45:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:43.410662 543705 memory.go:191] Add success.
I0319 20:45:43.409791 543705 cpu.go:282] Add success.
I0319 20:45:43.420363 543705 net.go:648] Add success.
I0319 20:45:43.423099 543705 net.go:770] primary dev: ETH0
I0319 20:45:43.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:45:43.423124 543705 net.go:698] Add success.
I0319 20:45:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:45:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:45:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:45:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:45:53.409776 543705 memory.go:184] no items to output this cycle
I0319 20:45:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 20:46:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:03.409779 543705 memory.go:184] no items to output this cycle
I0319 20:46:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 20:46:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:13.409812 543705 memory.go:191] Add success.
I0319 20:46:13.409817 543705 cpu.go:282] Add success.
W0319 20:46:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:46:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:46:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:46:13.420144 543705 net.go:648] Add success.
I0319 20:46:13.423108 543705 net.go:770] primary dev: ETH0
I0319 20:46:13.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:46:13.423133 543705 net.go:698] Add success.
I0319 20:46:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:46:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:46:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0319 20:46:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:46:14.456922 543705 disk_worker.go:494] system disk:vda1
I0319 20:46:14.456952 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:46:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:46:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:46:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:46:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:46:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:46:20.889674 543705 disk_info.go:125] begin check local disk info of client
I0319 20:46:20.892114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:46:20.892120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8300 0xc0003c8340]
E0319 20:46:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:23.409775 543705 cpu.go:275] no items to output this cycle
I0319 20:46:23.409779 543705 memory.go:184] no items to output this cycle
E0319 20:46:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:33.409804 543705 memory.go:184] no items to output this cycle
I0319 20:46:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 20:46:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:43.409782 543705 memory.go:191] Add success.
I0319 20:46:43.409812 543705 cpu.go:282] Add success.
I0319 20:46:43.419875 543705 net.go:648] Add success.
I0319 20:46:43.422802 543705 net.go:770] primary dev: ETH0
I0319 20:46:43.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:46:43.422827 543705 net.go:698] Add success.
I0319 20:46:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:46:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:46:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:46:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:46:53.409767 543705 memory.go:184] no items to output this cycle
I0319 20:46:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 20:47:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:03.409783 543705 memory.go:184] no items to output this cycle
I0319 20:47:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 20:47:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:13.409799 543705 memory.go:191] Add success.
I0319 20:47:13.409820 543705 cpu.go:282] Add success.
W0319 20:47:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:47:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:47:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:47:13.420111 543705 net.go:648] Add success.
I0319 20:47:13.422835 543705 net.go:770] primary dev: ETH0
I0319 20:47:13.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:47:13.422865 543705 net.go:698] Add success.
I0319 20:47:13.453440 543705 event_worker.go:152] Polling the log file for events...
W0319 20:47:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:47:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 20:47:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:47:14.456936 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:47:14.456945 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:47:14.456951 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:47:14.457112 543705 disk_worker.go:494] system disk:vda1
I0319 20:47:14.457148 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:47:15.456794 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:47:15.456804 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:47:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:47:16.457923 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:47:16.457976 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:47:16.457995 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:47:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:47:20.893679 543705 disk_info.go:125] begin check local disk info of client
I0319 20:47:20.896055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:47:20.896061 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b8340 0xc0002b8380]
E0319 20:47:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:23.409800 543705 memory.go:184] no items to output this cycle
I0319 20:47:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:47:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:33.409789 543705 memory.go:184] no items to output this cycle
I0319 20:47:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 20:47:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:43.409825 543705 memory.go:191] Add success.
I0319 20:47:43.409828 543705 cpu.go:282] Add success.
I0319 20:47:43.419960 543705 net.go:648] Add success.
I0319 20:47:43.422590 543705 net.go:770] primary dev: ETH0
I0319 20:47:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:47:43.422618 543705 net.go:698] Add success.
I0319 20:47:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:47:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:47:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:47:53.409777 543705 memory.go:184] no items to output this cycle
I0319 20:47:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 20:48:03.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:03.409824 543705 memory.go:184] no items to output this cycle
I0319 20:48:03.409835 543705 cpu.go:275] no items to output this cycle
E0319 20:48:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:13.409814 543705 memory.go:191] Add success.
I0319 20:48:13.409821 543705 cpu.go:282] Add success.
W0319 20:48:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:48:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:48:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:48:13.420070 543705 net.go:648] Add success.
I0319 20:48:13.422769 543705 net.go:770] primary dev: ETH0
I0319 20:48:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:48:13.422796 543705 net.go:698] Add success.
I0319 20:48:13.469462 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1a396fb9-1edb-420d-b76d-435e1e759b9d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:48:13.469495 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:48:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:48:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:48:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0319 20:48:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:48:14.456598 543705 disk_worker.go:494] system disk:vda1
I0319 20:48:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:48:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:48:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:48:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:48:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:48:16.472355 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:48:20.897672 543705 disk_info.go:125] begin check local disk info of client
I0319 20:48:20.900134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:48:20.900140 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328340 0xc000328380]
E0319 20:48:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:23.409796 543705 memory.go:184] no items to output this cycle
I0319 20:48:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 20:48:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:33.409778 543705 memory.go:184] no items to output this cycle
I0319 20:48:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 20:48:37.880093 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:48:37.880099 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:48:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:43.410745 543705 memory.go:191] Add success.
I0319 20:48:43.409826 543705 cpu.go:282] Add success.
I0319 20:48:43.420445 543705 net.go:648] Add success.
I0319 20:48:43.423419 543705 net.go:770] primary dev: ETH0
I0319 20:48:43.423432 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:48:43.423444 543705 net.go:698] Add success.
I0319 20:48:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:48:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:48:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:48:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:48:53.409805 543705 memory.go:184] no items to output this cycle
I0319 20:48:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:49:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:03.409813 543705 memory.go:184] no items to output this cycle
I0319 20:49:03.409829 543705 cpu.go:275] no items to output this cycle
E0319 20:49:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:13.409788 543705 memory.go:191] Add success.
I0319 20:49:13.409815 543705 cpu.go:282] Add success.
W0319 20:49:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:49:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:49:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:49:13.420116 543705 net.go:648] Add success.
I0319 20:49:13.423111 543705 net.go:770] primary dev: ETH0
I0319 20:49:13.423126 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:49:13.423140 543705 net.go:698] Add success.
I0319 20:49:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:49:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:49:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 20:49:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:49:14.456590 543705 disk_worker.go:494] system disk:vda1
I0319 20:49:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:49:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:49:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:49:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:49:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:49:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:49:20.901675 543705 disk_info.go:125] begin check local disk info of client
I0319 20:49:20.904098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:49:20.904105 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328100 0xc000328140]
E0319 20:49:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:23.409802 543705 memory.go:184] no items to output this cycle
I0319 20:49:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:49:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:33.409785 543705 memory.go:184] no items to output this cycle
I0319 20:49:33.409830 543705 cpu.go:275] no items to output this cycle
E0319 20:49:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:43.409821 543705 memory.go:191] Add success.
I0319 20:49:43.409831 543705 cpu.go:282] Add success.
I0319 20:49:43.419906 543705 net.go:648] Add success.
I0319 20:49:43.423414 543705 net.go:770] primary dev: ETH0
I0319 20:49:43.423428 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:49:43.423440 543705 net.go:698] Add success.
I0319 20:49:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:49:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:49:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:49:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:49:53.409785 543705 memory.go:184] no items to output this cycle
I0319 20:49:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 20:50:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:03.409774 543705 memory.go:184] no items to output this cycle
I0319 20:50:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:50:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:13.409825 543705 memory.go:191] Add success.
I0319 20:50:13.409834 543705 cpu.go:282] Add success.
W0319 20:50:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:50:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:50:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:50:13.420180 543705 net.go:648] Add success.
I0319 20:50:13.422844 543705 net.go:770] primary dev: ETH0
I0319 20:50:13.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:50:13.422869 543705 net.go:698] Add success.
I0319 20:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:50:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:50:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 20:50:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:50:14.456491 543705 disk_worker.go:494] system disk:vda1
I0319 20:50:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:50:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:50:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:50:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:50:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:50:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:50:20.905672 543705 disk_info.go:125] begin check local disk info of client
I0319 20:50:20.908096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:50:20.908102 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba2c0 0xc0002ba300]
E0319 20:50:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:23.409765 543705 memory.go:184] no items to output this cycle
I0319 20:50:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 20:50:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:33.409779 543705 memory.go:184] no items to output this cycle
I0319 20:50:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 20:50:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:43.409817 543705 memory.go:191] Add success.
I0319 20:50:43.409827 543705 cpu.go:282] Add success.
I0319 20:50:43.420286 543705 net.go:648] Add success.
I0319 20:50:43.423019 543705 net.go:770] primary dev: ETH0
I0319 20:50:43.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:50:43.423045 543705 net.go:698] Add success.
I0319 20:50:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:50:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:50:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:50:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:50:53.409770 543705 memory.go:184] no items to output this cycle
I0319 20:50:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 20:51:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:03.409777 543705 memory.go:184] no items to output this cycle
I0319 20:51:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 20:51:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:13.409791 543705 cpu.go:282] Add success.
I0319 20:51:13.409793 543705 memory.go:191] Add success.
W0319 20:51:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:51:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:51:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:51:13.420178 543705 net.go:648] Add success.
I0319 20:51:13.423013 543705 net.go:770] primary dev: ETH0
I0319 20:51:13.423026 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:51:13.423038 543705 net.go:698] Add success.
I0319 20:51:13.464390 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"19f5dba8-d152-4ec7-826c-6639d1dd69db","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:51:13.464424 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:51:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:51:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:51:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 20:51:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:51:14.456614 543705 disk_worker.go:494] system disk:vda1
I0319 20:51:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:51:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:51:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:51:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:51:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:51:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:51:20.909667 543705 disk_info.go:125] begin check local disk info of client
I0319 20:51:20.912120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:51:20.912127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b8ac0 0xc0002b8b00]
E0319 20:51:23.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:23.409890 543705 memory.go:184] no items to output this cycle
I0319 20:51:23.410002 543705 cpu.go:275] no items to output this cycle
E0319 20:51:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:33.409779 543705 memory.go:184] no items to output this cycle
I0319 20:51:33.409819 543705 cpu.go:275] no items to output this cycle
I0319 20:51:37.881760 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:51:37.881768 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:51:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:43.410813 543705 memory.go:191] Add success.
I0319 20:51:43.409846 543705 cpu.go:282] Add success.
I0319 20:51:43.420615 543705 net.go:648] Add success.
I0319 20:51:43.423351 543705 net.go:770] primary dev: ETH0
I0319 20:51:43.423365 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:51:43.423378 543705 net.go:698] Add success.
I0319 20:51:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:51:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:51:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:51:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:51:53.409804 543705 memory.go:184] no items to output this cycle
I0319 20:51:53.409818 543705 cpu.go:275] no items to output this cycle
E0319 20:52:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:03.409780 543705 memory.go:184] no items to output this cycle
I0319 20:52:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 20:52:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:13.409791 543705 memory.go:191] Add success.
I0319 20:52:13.409798 543705 cpu.go:282] Add success.
W0319 20:52:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:52:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:52:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:52:13.420059 543705 net.go:648] Add success.
I0319 20:52:13.422918 543705 net.go:770] primary dev: ETH0
I0319 20:52:13.422930 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:52:13.422942 543705 net.go:698] Add success.
W0319 20:52:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:52:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 20:52:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:52:14.455893 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:52:14.455902 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:52:14.455908 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:52:14.456554 543705 disk_worker.go:494] system disk:vda1
I0319 20:52:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:52:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:52:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:52:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:52:16.457953 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:52:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:52:16.458042 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:52:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:52:20.913677 543705 disk_info.go:125] begin check local disk info of client
I0319 20:52:20.916100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:52:20.916107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f90c0 0xc0001f9100]
E0319 20:52:23.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:23.409859 543705 memory.go:184] no items to output this cycle
I0319 20:52:23.409937 543705 cpu.go:275] no items to output this cycle
E0319 20:52:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:33.409788 543705 memory.go:184] no items to output this cycle
I0319 20:52:33.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:52:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:43.409812 543705 memory.go:191] Add success.
I0319 20:52:43.409817 543705 cpu.go:282] Add success.
I0319 20:52:43.420026 543705 net.go:648] Add success.
I0319 20:52:43.423156 543705 net.go:770] primary dev: ETH0
I0319 20:52:43.423171 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:52:43.423184 543705 net.go:698] Add success.
I0319 20:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:52:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:52:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:52:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:52:53.409788 543705 memory.go:184] no items to output this cycle
I0319 20:52:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 20:53:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:03.409778 543705 memory.go:184] no items to output this cycle
I0319 20:53:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 20:53:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:13.409785 543705 memory.go:191] Add success.
I0319 20:53:13.409801 543705 cpu.go:282] Add success.
W0319 20:53:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:53:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:53:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:53:13.420071 543705 net.go:648] Add success.
I0319 20:53:13.423268 543705 net.go:770] primary dev: ETH0
I0319 20:53:13.423281 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:53:13.423293 543705 net.go:698] Add success.
I0319 20:53:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:53:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:53:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 20:53:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:53:14.456594 543705 disk_worker.go:494] system disk:vda1
I0319 20:53:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:53:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:53:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:53:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:53:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:53:16.472458 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:53:20.917681 543705 disk_info.go:125] begin check local disk info of client
I0319 20:53:20.920146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:53:20.920152 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b9e40 0xc0002b9e80]
E0319 20:53:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:23.409774 543705 memory.go:184] no items to output this cycle
I0319 20:53:23.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:53:33.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:33.409913 543705 memory.go:184] no items to output this cycle
I0319 20:53:33.409960 543705 cpu.go:275] no items to output this cycle
E0319 20:53:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:43.409795 543705 memory.go:191] Add success.
I0319 20:53:43.409796 543705 cpu.go:282] Add success.
I0319 20:53:43.419977 543705 net.go:648] Add success.
I0319 20:53:43.423068 543705 net.go:770] primary dev: ETH0
I0319 20:53:43.423083 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:53:43.423110 543705 net.go:698] Add success.
I0319 20:53:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:53:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:53:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:53:53.410397 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:53:53.410419 543705 memory.go:184] no items to output this cycle
I0319 20:53:53.410434 543705 cpu.go:275] no items to output this cycle
E0319 20:54:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:03.409820 543705 memory.go:184] no items to output this cycle
I0319 20:54:03.409833 543705 cpu.go:275] no items to output this cycle
E0319 20:54:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:13.409782 543705 memory.go:191] Add success.
I0319 20:54:13.409800 543705 cpu.go:282] Add success.
W0319 20:54:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:54:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:54:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:54:13.420107 543705 net.go:648] Add success.
I0319 20:54:13.422978 543705 net.go:770] primary dev: ETH0
I0319 20:54:13.422992 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:54:13.423004 543705 net.go:698] Add success.
I0319 20:54:13.464718 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"07ebac0a-678b-46a9-8c7e-a4eb8af144d0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:54:13.464759 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 20:54:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:54:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:54:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 20:54:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:54:14.456536 543705 disk_worker.go:494] system disk:vda1
I0319 20:54:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:54:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:54:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:54:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:54:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:54:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:54:20.921672 543705 disk_info.go:125] begin check local disk info of client
I0319 20:54:20.924096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:54:20.924103 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028d680 0xc00028d6c0]
E0319 20:54:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:23.409786 543705 memory.go:184] no items to output this cycle
I0319 20:54:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:54:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:33.409819 543705 memory.go:184] no items to output this cycle
I0319 20:54:33.409829 543705 cpu.go:275] no items to output this cycle
I0319 20:54:37.884122 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:54:37.884128 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:54:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:43.410575 543705 memory.go:191] Add success.
I0319 20:54:43.409807 543705 cpu.go:282] Add success.
I0319 20:54:43.420290 543705 net.go:648] Add success.
I0319 20:54:43.422891 543705 net.go:770] primary dev: ETH0
I0319 20:54:43.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:54:43.422919 543705 net.go:698] Add success.
I0319 20:54:46.458490 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:54:46.458555 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:54:46.458579 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:54:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:54:53.409777 543705 memory.go:184] no items to output this cycle
I0319 20:54:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 20:55:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:03.409784 543705 memory.go:184] no items to output this cycle
I0319 20:55:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 20:55:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:13.409788 543705 memory.go:191] Add success.
I0319 20:55:13.409789 543705 cpu.go:282] Add success.
W0319 20:55:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:55:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:55:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:55:13.420069 543705 net.go:648] Add success.
I0319 20:55:13.422918 543705 net.go:770] primary dev: ETH0
I0319 20:55:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:55:13.422948 543705 net.go:698] Add success.
I0319 20:55:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:55:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:55:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 20:55:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:55:14.456600 543705 disk_worker.go:494] system disk:vda1
I0319 20:55:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:55:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:55:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:55:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:55:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:55:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:55:20.925672 543705 disk_info.go:125] begin check local disk info of client
I0319 20:55:20.928075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:55:20.928081 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e8c0 0xc00039e900]
E0319 20:55:23.410238 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:23.410250 543705 cpu.go:275] no items to output this cycle
I0319 20:55:23.410365 543705 memory.go:184] no items to output this cycle
E0319 20:55:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 20:55:33.409807 543705 memory.go:184] no items to output this cycle
E0319 20:55:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:43.409811 543705 memory.go:191] Add success.
I0319 20:55:43.409825 543705 cpu.go:282] Add success.
I0319 20:55:43.420060 543705 net.go:648] Add success.
I0319 20:55:43.422977 543705 net.go:770] primary dev: ETH0
I0319 20:55:43.422990 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:55:43.423002 543705 net.go:698] Add success.
I0319 20:55:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:55:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:55:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:55:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:55:53.409804 543705 memory.go:184] no items to output this cycle
I0319 20:55:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 20:56:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:03.409781 543705 memory.go:184] no items to output this cycle
I0319 20:56:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 20:56:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:13.409776 543705 memory.go:191] Add success.
W0319 20:56:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 20:56:13.409806 543705 cpu.go:282] Add success.
W0319 20:56:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:56:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:56:13.420064 543705 net.go:648] Add success.
I0319 20:56:13.423073 543705 net.go:770] primary dev: ETH0
I0319 20:56:13.423087 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:56:13.423101 543705 net.go:698] Add success.
I0319 20:56:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:56:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:56:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 20:56:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:56:14.456577 543705 disk_worker.go:494] system disk:vda1
I0319 20:56:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:56:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:56:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:56:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:56:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:56:16.472428 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:56:20.929671 543705 disk_info.go:125] begin check local disk info of client
I0319 20:56:20.932071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:56:20.932077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395a00 0xc000395a40]
E0319 20:56:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:23.409896 543705 memory.go:184] no items to output this cycle
I0319 20:56:23.409895 543705 cpu.go:275] no items to output this cycle
E0319 20:56:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:33.409815 543705 memory.go:184] no items to output this cycle
I0319 20:56:33.409829 543705 cpu.go:275] no items to output this cycle
E0319 20:56:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:43.409787 543705 memory.go:191] Add success.
I0319 20:56:43.409807 543705 cpu.go:282] Add success.
I0319 20:56:43.420044 543705 net.go:648] Add success.
I0319 20:56:43.422956 543705 net.go:770] primary dev: ETH0
I0319 20:56:43.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:56:43.422982 543705 net.go:698] Add success.
I0319 20:56:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:56:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:56:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:56:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:56:53.409777 543705 memory.go:184] no items to output this cycle
I0319 20:56:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 20:57:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:03.409787 543705 memory.go:184] no items to output this cycle
I0319 20:57:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 20:57:13.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:13.409769 543705 memory.go:191] Add success.
W0319 20:57:13.409795 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:57:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:57:13.409809 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:57:13.409814 543705 cpu.go:282] Add success.
I0319 20:57:13.420112 543705 net.go:648] Add success.
I0319 20:57:13.429535 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 20:57:13.429618 543705 net.go:770] primary dev: ETH0
I0319 20:57:13.429630 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:57:13.429641 543705 net.go:698] Add success.
I0319 20:57:13.453219 543705 event_worker.go:152] Polling the log file for events...
I0319 20:57:13.703075 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"340aee55-9f52-4488-a4e4-31051f91c045","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 20:57:13.703110 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 20:57:14.454856 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:57:14.454936 543705 disk_worker.go:708] disk space is not compliant
W0319 20:57:14.454940 543705 disk_worker.go:728] disk inode is not compliant
E0319 20:57:14.455675 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 20:57:14.455684 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 20:57:14.455690 543705 custom_config.go:64] query custom config with name: gpu
I0319 20:57:14.456480 543705 disk_worker.go:494] system disk:vda1
I0319 20:57:14.456509 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 20:57:15.456817 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 20:57:15.456828 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:57:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 20:57:16.457930 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 20:57:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:57:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:57:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:57:20.933677 543705 disk_info.go:125] begin check local disk info of client
I0319 20:57:20.936070 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:57:20.936077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000368000 0xc000368040]
E0319 20:57:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:23.409786 543705 memory.go:184] no items to output this cycle
I0319 20:57:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:57:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 20:57:33.409796 543705 memory.go:184] no items to output this cycle
I0319 20:57:37.885731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 20:57:37.885738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 20:57:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:43.410638 543705 memory.go:191] Add success.
I0319 20:57:43.409801 543705 cpu.go:282] Add success.
I0319 20:57:43.420338 543705 net.go:648] Add success.
I0319 20:57:43.422968 543705 net.go:770] primary dev: ETH0
I0319 20:57:43.422981 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:57:43.422994 543705 net.go:698] Add success.
I0319 20:57:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:57:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:57:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:57:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:57:53.409786 543705 memory.go:184] no items to output this cycle
I0319 20:57:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 20:58:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:03.409783 543705 memory.go:184] no items to output this cycle
I0319 20:58:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 20:58:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:13.409820 543705 memory.go:191] Add success.
I0319 20:58:13.409830 543705 cpu.go:282] Add success.
W0319 20:58:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:58:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:58:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:58:13.420121 543705 net.go:648] Add success.
I0319 20:58:13.423151 543705 net.go:770] primary dev: ETH0
I0319 20:58:13.423166 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:58:13.423181 543705 net.go:698] Add success.
I0319 20:58:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:58:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:58:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0319 20:58:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:58:14.456497 543705 disk_worker.go:494] system disk:vda1
I0319 20:58:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:58:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:58:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:58:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:58:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:58:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:58:20.937681 543705 disk_info.go:125] begin check local disk info of client
I0319 20:58:20.940165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:58:20.940172 543705 disk_info.go:196] parse disk info done, disk is : [0xc000576300 0xc000576340]
E0319 20:58:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:23.409800 543705 memory.go:184] no items to output this cycle
I0319 20:58:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 20:58:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:33.409792 543705 memory.go:184] no items to output this cycle
I0319 20:58:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 20:58:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:43.409794 543705 cpu.go:282] Add success.
I0319 20:58:43.409807 543705 memory.go:191] Add success.
I0319 20:58:43.419908 543705 net.go:648] Add success.
I0319 20:58:43.422771 543705 net.go:770] primary dev: ETH0
I0319 20:58:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:58:43.422797 543705 net.go:698] Add success.
I0319 20:58:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:58:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:58:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:58:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:58:53.409810 543705 memory.go:184] no items to output this cycle
I0319 20:58:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 20:59:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:03.409796 543705 memory.go:184] no items to output this cycle
I0319 20:59:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 20:59:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:13.409800 543705 cpu.go:282] Add success.
I0319 20:59:13.409806 543705 memory.go:191] Add success.
W0319 20:59:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 20:59:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 20:59:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 20:59:13.420044 543705 net.go:648] Add success.
I0319 20:59:13.422877 543705 net.go:770] primary dev: ETH0
I0319 20:59:13.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:59:13.422902 543705 net.go:698] Add success.
I0319 20:59:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 20:59:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 20:59:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 20:59:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 20:59:14.456586 543705 disk_worker.go:494] system disk:vda1
I0319 20:59:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 20:59:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 20:59:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:59:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:59:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 20:59:16.472468 543705 disk_local_worker.go:436] Get disk info: []
I0319 20:59:20.941676 543705 disk_info.go:125] begin check local disk info of client
I0319 20:59:20.944100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 20:59:20.944106 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b4080 0xc0004b40c0]
E0319 20:59:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:23.409770 543705 memory.go:184] no items to output this cycle
I0319 20:59:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 20:59:33.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:33.409823 543705 memory.go:184] no items to output this cycle
I0319 20:59:33.409834 543705 cpu.go:275] no items to output this cycle
E0319 20:59:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:43.409782 543705 memory.go:191] Add success.
I0319 20:59:43.409820 543705 cpu.go:282] Add success.
I0319 20:59:43.419954 543705 net.go:648] Add success.
I0319 20:59:43.422452 543705 net.go:770] primary dev: ETH0
I0319 20:59:43.422466 543705 net.go:802] Send network stats successfully!,count is 6
I0319 20:59:43.422480 543705 net.go:698] Add success.
I0319 20:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 20:59:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 20:59:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 20:59:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 20:59:53.409781 543705 cpu.go:275] no items to output this cycle
I0319 20:59:53.409784 543705 memory.go:184] no items to output this cycle
E0319 21:00:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:03.409779 543705 memory.go:184] no items to output this cycle
I0319 21:00:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 21:00:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:13.409781 543705 memory.go:191] Add success.
I0319 21:00:13.409806 543705 cpu.go:282] Add success.
W0319 21:00:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:00:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:00:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:00:13.420060 543705 net.go:648] Add success.
I0319 21:00:13.422730 543705 net.go:770] primary dev: ETH0
I0319 21:00:13.422743 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:00:13.422754 543705 net.go:698] Add success.
I0319 21:00:13.469109 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"094f6fda-7d31-4611-97c9-43f1367616f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:00:13.469152 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:00:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:00:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:00:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 21:00:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:00:14.456665 543705 disk_worker.go:494] system disk:vda1
I0319 21:00:14.456696 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:00:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:00:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:00:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:00:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:00:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:00:20.945679 543705 disk_info.go:125] begin check local disk info of client
I0319 21:00:20.948192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:00:20.948198 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003924c0 0xc000392500]
E0319 21:00:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:23.409792 543705 memory.go:184] no items to output this cycle
I0319 21:00:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 21:00:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:33.409810 543705 memory.go:184] no items to output this cycle
I0319 21:00:33.409820 543705 cpu.go:275] no items to output this cycle
I0319 21:00:37.888128 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:00:37.888135 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:00:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:43.410621 543705 memory.go:191] Add success.
I0319 21:00:43.409805 543705 cpu.go:282] Add success.
I0319 21:00:43.420311 543705 net.go:648] Add success.
I0319 21:00:43.423086 543705 net.go:770] primary dev: ETH0
I0319 21:00:43.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:00:43.423113 543705 net.go:698] Add success.
I0319 21:00:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:00:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:00:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:00:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:00:53.409774 543705 memory.go:184] no items to output this cycle
I0319 21:00:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 21:01:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:03.409797 543705 memory.go:184] no items to output this cycle
I0319 21:01:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 21:01:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:13.409777 543705 memory.go:191] Add success.
W0319 21:01:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:01:13.409807 543705 cpu.go:282] Add success.
W0319 21:01:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:01:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:01:13.420088 543705 net.go:648] Add success.
I0319 21:01:13.423505 543705 net.go:770] primary dev: ETH0
I0319 21:01:13.423520 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:01:13.423534 543705 net.go:698] Add success.
I0319 21:01:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:01:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:01:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 21:01:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:01:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 21:01:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:01:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:01:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:01:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:01:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:01:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:01:20.949665 543705 disk_info.go:125] begin check local disk info of client
I0319 21:01:20.952073 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:01:20.952080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5880 0xc0002a58c0]
E0319 21:01:23.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:23.409900 543705 memory.go:184] no items to output this cycle
I0319 21:01:23.409960 543705 cpu.go:275] no items to output this cycle
E0319 21:01:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:33.409789 543705 cpu.go:275] no items to output this cycle
I0319 21:01:33.409796 543705 memory.go:184] no items to output this cycle
E0319 21:01:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:43.409789 543705 memory.go:191] Add success.
I0319 21:01:43.409803 543705 cpu.go:282] Add success.
I0319 21:01:43.419949 543705 net.go:648] Add success.
I0319 21:01:43.422864 543705 net.go:770] primary dev: ETH0
I0319 21:01:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:01:43.422893 543705 net.go:698] Add success.
I0319 21:01:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:01:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:01:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:01:53.409787 543705 memory.go:184] no items to output this cycle
I0319 21:01:53.409792 543705 cpu.go:275] no items to output this cycle
E0319 21:02:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:03.409779 543705 memory.go:184] no items to output this cycle
I0319 21:02:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:02:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:13.409784 543705 memory.go:191] Add success.
I0319 21:02:13.409786 543705 cpu.go:282] Add success.
W0319 21:02:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:02:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:02:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:02:13.420065 543705 net.go:648] Add success.
I0319 21:02:13.422869 543705 net.go:770] primary dev: ETH0
I0319 21:02:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:02:13.422896 543705 net.go:698] Add success.
W0319 21:02:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:02:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 21:02:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:02:14.456595 543705 disk_worker.go:494] system disk:vda1
I0319 21:02:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:02:14.456947 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:02:14.456957 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:02:14.456963 543705 custom_config.go:64] query custom config with name: gpu
E0319 21:02:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:02:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:02:16.457914 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:02:16.457914 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:02:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:02:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:02:16.472327 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:02:20.953678 543705 disk_info.go:125] begin check local disk info of client
I0319 21:02:20.956258 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:02:20.956264 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f280 0xc00029f2c0]
E0319 21:02:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:23.409763 543705 memory.go:184] no items to output this cycle
I0319 21:02:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:02:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:33.409795 543705 memory.go:184] no items to output this cycle
I0319 21:02:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 21:02:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:43.409785 543705 memory.go:191] Add success.
I0319 21:02:43.409796 543705 cpu.go:282] Add success.
I0319 21:02:43.419961 543705 net.go:648] Add success.
I0319 21:02:43.422777 543705 net.go:770] primary dev: ETH0
I0319 21:02:43.422790 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:02:43.422802 543705 net.go:698] Add success.
I0319 21:02:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:02:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:02:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:02:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:02:53.409798 543705 memory.go:184] no items to output this cycle
I0319 21:02:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 21:03:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:03.409787 543705 memory.go:184] no items to output this cycle
I0319 21:03:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 21:03:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:13.409785 543705 memory.go:191] Add success.
I0319 21:03:13.409801 543705 cpu.go:282] Add success.
W0319 21:03:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:03:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:03:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:03:13.420210 543705 net.go:648] Add success.
I0319 21:03:13.422989 543705 net.go:770] primary dev: ETH0
I0319 21:03:13.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:03:13.423018 543705 net.go:698] Add success.
I0319 21:03:13.838093 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1dc7bee4-b268-45a5-b90e-aecf1b4bb522","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:03:13.838139 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:03:14.454694 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:03:14.454897 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:03:14.454907 543705 disk_worker.go:708] disk space is not compliant
W0319 21:03:14.454910 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:03:14.456249 543705 disk_worker.go:494] system disk:vda1
I0319 21:03:14.456292 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:03:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:03:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:03:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:03:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:03:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:03:20.957676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:03:20.960171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:03:20.960178 543705 disk_info.go:196] parse disk info done, disk is : [0xc000374f00 0xc000374f40]
E0319 21:03:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:23.409791 543705 memory.go:184] no items to output this cycle
I0319 21:03:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 21:03:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:33.409798 543705 memory.go:184] no items to output this cycle
I0319 21:03:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 21:03:37.889730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:03:37.889737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:03:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:43.410635 543705 memory.go:191] Add success.
I0319 21:03:43.409796 543705 cpu.go:282] Add success.
I0319 21:03:43.420381 543705 net.go:648] Add success.
I0319 21:03:43.422811 543705 net.go:770] primary dev: ETH0
I0319 21:03:43.422824 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:03:43.422837 543705 net.go:698] Add success.
I0319 21:03:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:03:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:03:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:03:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:03:53.409799 543705 memory.go:184] no items to output this cycle
I0319 21:03:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 21:04:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:03.409781 543705 memory.go:184] no items to output this cycle
I0319 21:04:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 21:04:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:13.409816 543705 memory.go:191] Add success.
I0319 21:04:13.409828 543705 cpu.go:282] Add success.
W0319 21:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:04:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:04:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:04:13.420148 543705 net.go:648] Add success.
I0319 21:04:13.422921 543705 net.go:770] primary dev: ETH0
I0319 21:04:13.422937 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:04:13.422951 543705 net.go:698] Add success.
I0319 21:04:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:04:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:04:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 21:04:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:04:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 21:04:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:04:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:04:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:04:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:04:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:04:20.961672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:04:20.964118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:04:20.964125 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b6740 0xc0002b6780]
E0319 21:04:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:23.409785 543705 memory.go:184] no items to output this cycle
I0319 21:04:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 21:04:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:33.409889 543705 memory.go:184] no items to output this cycle
I0319 21:04:33.409977 543705 cpu.go:275] no items to output this cycle
E0319 21:04:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:43.409796 543705 memory.go:191] Add success.
I0319 21:04:43.409806 543705 cpu.go:282] Add success.
I0319 21:04:43.419938 543705 net.go:648] Add success.
I0319 21:04:43.422685 543705 net.go:770] primary dev: ETH0
I0319 21:04:43.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:04:43.422710 543705 net.go:698] Add success.
I0319 21:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:04:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:04:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:04:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:04:53.409795 543705 memory.go:184] no items to output this cycle
I0319 21:04:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 21:05:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:03.409809 543705 memory.go:184] no items to output this cycle
I0319 21:05:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 21:05:13.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:13.409769 543705 memory.go:191] Add success.
W0319 21:05:13.409794 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:05:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:05:13.409808 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:05:13.409809 543705 cpu.go:282] Add success.
I0319 21:05:13.420278 543705 net.go:648] Add success.
I0319 21:05:13.423413 543705 net.go:770] primary dev: ETH0
I0319 21:05:13.423427 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:05:13.423438 543705 net.go:698] Add success.
I0319 21:05:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:05:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:05:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0319 21:05:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:05:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 21:05:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:05:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:05:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:05:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:05:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:05:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:05:20.965673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:05:20.968090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:05:20.968096 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000de980 0xc0000de9c0]
E0319 21:05:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:23.409760 543705 memory.go:184] no items to output this cycle
I0319 21:05:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 21:05:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:33.409777 543705 memory.go:184] no items to output this cycle
I0319 21:05:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 21:05:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:43.409804 543705 memory.go:191] Add success.
I0319 21:05:43.409806 543705 cpu.go:282] Add success.
I0319 21:05:43.419982 543705 net.go:648] Add success.
I0319 21:05:43.422866 543705 net.go:770] primary dev: ETH0
I0319 21:05:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:05:43.422894 543705 net.go:698] Add success.
I0319 21:05:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:05:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:05:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:05:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:05:53.409779 543705 memory.go:184] no items to output this cycle
I0319 21:05:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 21:06:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:03.409782 543705 memory.go:184] no items to output this cycle
I0319 21:06:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 21:06:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:13.409776 543705 memory.go:191] Add success.
W0319 21:06:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:06:13.409809 543705 cpu.go:282] Add success.
W0319 21:06:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:06:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:06:13.420035 543705 net.go:648] Add success.
I0319 21:06:13.422762 543705 net.go:770] primary dev: ETH0
I0319 21:06:13.422775 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:06:13.422787 543705 net.go:698] Add success.
I0319 21:06:13.480875 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"26baa5b2-75b3-49c5-93bf-da8d0c5ea60b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:06:13.480908 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:06:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:06:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:06:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 21:06:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:06:14.456603 543705 disk_worker.go:494] system disk:vda1
I0319 21:06:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:06:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:06:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:06:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:06:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:06:16.472566 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:06:20.969673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:06:20.972095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:06:20.972101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003073c0 0xc000307400]
E0319 21:06:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:23.409790 543705 memory.go:184] no items to output this cycle
I0319 21:06:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 21:06:33.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:33.409888 543705 cpu.go:275] no items to output this cycle
I0319 21:06:33.409896 543705 memory.go:184] no items to output this cycle
I0319 21:06:37.892162 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:06:37.892169 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:06:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:43.410765 543705 memory.go:191] Add success.
I0319 21:06:43.409829 543705 cpu.go:282] Add success.
I0319 21:06:43.420475 543705 net.go:648] Add success.
I0319 21:06:43.423230 543705 net.go:770] primary dev: ETH0
I0319 21:06:43.423244 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:06:43.423257 543705 net.go:698] Add success.
I0319 21:06:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:06:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:06:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:06:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:06:53.409774 543705 memory.go:184] no items to output this cycle
I0319 21:06:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:07:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:03.409790 543705 memory.go:184] no items to output this cycle
I0319 21:07:03.409793 543705 cpu.go:275] no items to output this cycle
W0319 21:07:13.409708 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:07:13.409725 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:07:13.409731 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:07:13.409798 543705 cpu.go:282] Add success.
E0319 21:07:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:13.409820 543705 memory.go:191] Add success.
I0319 21:07:13.420068 543705 net.go:648] Add success.
I0319 21:07:13.423184 543705 net.go:770] primary dev: ETH0
I0319 21:07:13.423199 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:07:13.423214 543705 net.go:698] Add success.
I0319 21:07:13.452855 543705 event_worker.go:152] Polling the log file for events...
W0319 21:07:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:07:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 21:07:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:07:14.455885 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:07:14.455893 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:07:14.455898 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:07:14.456555 543705 disk_worker.go:494] system disk:vda1
I0319 21:07:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:07:15.456875 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:07:15.456884 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:07:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:07:16.457936 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:07:16.457994 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:07:16.458014 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:07:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:07:20.973675 543705 disk_info.go:125] begin check local disk info of client
I0319 21:07:20.976051 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:07:20.976057 543705 disk_info.go:196] parse disk info done, disk is : [0xc000216ac0 0xc000216b00]
E0319 21:07:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:23.409785 543705 memory.go:184] no items to output this cycle
I0319 21:07:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:07:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:33.409786 543705 memory.go:184] no items to output this cycle
I0319 21:07:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:07:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:43.409828 543705 memory.go:191] Add success.
I0319 21:07:43.409835 543705 cpu.go:282] Add success.
I0319 21:07:43.419964 543705 net.go:648] Add success.
I0319 21:07:43.423046 543705 net.go:770] primary dev: ETH0
I0319 21:07:43.423059 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:07:43.423072 543705 net.go:698] Add success.
I0319 21:07:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:07:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:07:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:07:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:07:53.409801 543705 memory.go:184] no items to output this cycle
I0319 21:07:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 21:08:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:03.409783 543705 memory.go:184] no items to output this cycle
I0319 21:08:03.409790 543705 cpu.go:275] no items to output this cycle
W0319 21:08:13.409709 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:08:13.409724 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:08:13.409729 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 21:08:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:13.409817 543705 memory.go:191] Add success.
I0319 21:08:13.409817 543705 cpu.go:282] Add success.
I0319 21:08:13.420149 543705 net.go:648] Add success.
I0319 21:08:13.423104 543705 net.go:770] primary dev: ETH0
I0319 21:08:13.423117 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:08:13.423128 543705 net.go:698] Add success.
I0319 21:08:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:08:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:08:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 21:08:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:08:14.456514 543705 disk_worker.go:494] system disk:vda1
I0319 21:08:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:08:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:08:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:08:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:08:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:08:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:08:20.977673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:08:20.980149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:08:20.980155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa800 0xc0001aa840]
E0319 21:08:23.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:23.409822 543705 memory.go:184] no items to output this cycle
I0319 21:08:23.409826 543705 cpu.go:275] no items to output this cycle
E0319 21:08:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:33.409780 543705 memory.go:184] no items to output this cycle
I0319 21:08:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 21:08:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:43.409798 543705 memory.go:191] Add success.
I0319 21:08:43.409802 543705 cpu.go:282] Add success.
I0319 21:08:43.420243 543705 net.go:648] Add success.
I0319 21:08:43.422813 543705 net.go:770] primary dev: ETH0
I0319 21:08:43.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:08:43.422838 543705 net.go:698] Add success.
I0319 21:08:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:08:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:08:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:08:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:08:53.409786 543705 memory.go:184] no items to output this cycle
I0319 21:08:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 21:09:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:03.409780 543705 memory.go:184] no items to output this cycle
I0319 21:09:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 21:09:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:13.409781 543705 memory.go:191] Add success.
I0319 21:09:13.409803 543705 cpu.go:282] Add success.
W0319 21:09:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:09:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:09:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:09:13.420000 543705 net.go:770] primary dev: ETH0
I0319 21:09:13.420012 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:09:13.420025 543705 net.go:698] Add success.
I0319 21:09:13.420264 543705 net.go:648] Add success.
I0319 21:09:13.562102 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4f82594a-5bb8-49bf-9512-cbad26f7fc30","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:09:13.562140 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:09:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:09:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:09:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 21:09:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:09:14.456628 543705 disk_worker.go:494] system disk:vda1
I0319 21:09:14.456658 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:09:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:09:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:09:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:09:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:09:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:09:20.981673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:09:20.984140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:09:20.984146 543705 disk_info.go:196] parse disk info done, disk is : [0xc000343a80 0xc000343ac0]
E0319 21:09:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:23.409773 543705 memory.go:184] no items to output this cycle
I0319 21:09:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 21:09:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:33.409813 543705 memory.go:184] no items to output this cycle
I0319 21:09:33.409822 543705 cpu.go:275] no items to output this cycle
I0319 21:09:37.893726 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:09:37.893734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:09:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:43.410758 543705 memory.go:191] Add success.
I0319 21:09:43.409828 543705 cpu.go:282] Add success.
I0319 21:09:43.419717 543705 net.go:648] Add success.
I0319 21:09:43.422680 543705 net.go:770] primary dev: ETH0
I0319 21:09:43.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:09:43.422706 543705 net.go:698] Add success.
I0319 21:09:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:09:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:09:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:09:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:09:53.409802 543705 memory.go:184] no items to output this cycle
I0319 21:09:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 21:10:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:03.409806 543705 memory.go:184] no items to output this cycle
I0319 21:10:03.409820 543705 cpu.go:275] no items to output this cycle
W0319 21:10:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:10:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:10:13.409739 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 21:10:13.409832 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:13.409837 543705 cpu.go:282] Add success.
I0319 21:10:13.409851 543705 memory.go:191] Add success.
I0319 21:10:13.419983 543705 net.go:648] Add success.
I0319 21:10:13.422983 543705 net.go:770] primary dev: ETH0
I0319 21:10:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:10:13.423011 543705 net.go:698] Add success.
I0319 21:10:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:10:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:10:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 21:10:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:10:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 21:10:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:10:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:10:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:10:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:10:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:10:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:10:20.985672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:10:20.988123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:10:20.988129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004625c0 0xc000462600]
E0319 21:10:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:10:23.409781 543705 cpu.go:275] no items to output this cycle
E0319 21:10:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:33.409811 543705 memory.go:184] no items to output this cycle
I0319 21:10:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 21:10:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:43.409781 543705 memory.go:191] Add success.
I0319 21:10:43.409815 543705 cpu.go:282] Add success.
I0319 21:10:43.420122 543705 net.go:648] Add success.
I0319 21:10:43.423278 543705 net.go:770] primary dev: ETH0
I0319 21:10:43.423291 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:10:43.423303 543705 net.go:698] Add success.
I0319 21:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:10:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:10:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:10:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:10:53.409775 543705 memory.go:184] no items to output this cycle
I0319 21:10:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 21:11:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:03.409787 543705 memory.go:184] no items to output this cycle
I0319 21:11:03.409793 543705 cpu.go:275] no items to output this cycle
W0319 21:11:13.409706 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:11:13.409721 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:11:13.409726 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 21:11:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:13.409814 543705 memory.go:191] Add success.
I0319 21:11:13.409824 543705 cpu.go:282] Add success.
I0319 21:11:13.420052 543705 net.go:648] Add success.
I0319 21:11:13.423647 543705 net.go:770] primary dev: ETH0
I0319 21:11:13.423659 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:11:13.423670 543705 net.go:698] Add success.
I0319 21:11:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:11:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:11:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 21:11:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:11:14.456608 543705 disk_worker.go:494] system disk:vda1
I0319 21:11:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:11:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:11:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:11:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:11:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:11:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:11:20.989671 543705 disk_info.go:125] begin check local disk info of client
I0319 21:11:20.992166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:11:20.992173 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f7280 0xc0001f72c0]
E0319 21:11:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:11:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 21:11:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:33.409781 543705 memory.go:184] no items to output this cycle
I0319 21:11:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 21:11:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:43.409799 543705 memory.go:191] Add success.
I0319 21:11:43.409800 543705 cpu.go:282] Add success.
I0319 21:11:43.419967 543705 net.go:648] Add success.
I0319 21:11:43.422901 543705 net.go:770] primary dev: ETH0
I0319 21:11:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:11:43.422926 543705 net.go:698] Add success.
I0319 21:11:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:11:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:11:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:11:53.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:11:53.409913 543705 memory.go:184] no items to output this cycle
I0319 21:11:53.410105 543705 cpu.go:275] no items to output this cycle
E0319 21:12:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:03.409808 543705 memory.go:184] no items to output this cycle
I0319 21:12:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 21:12:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:13.409790 543705 memory.go:191] Add success.
I0319 21:12:13.409794 543705 cpu.go:282] Add success.
W0319 21:12:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:12:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:12:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:12:13.420057 543705 net.go:648] Add success.
I0319 21:12:13.422791 543705 net.go:770] primary dev: ETH0
I0319 21:12:13.422806 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:12:13.422819 543705 net.go:698] Add success.
I0319 21:12:13.858134 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"77b9025e-b7b5-403e-8831-8737b099651f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:12:13.858169 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 21:12:14.454856 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:12:14.454922 543705 disk_worker.go:708] disk space is not compliant
W0319 21:12:14.454924 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:12:14.455915 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:12:14.455924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:12:14.455931 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:12:14.456296 543705 disk_worker.go:494] system disk:vda1
I0319 21:12:14.456324 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:12:15.456847 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:12:15.456856 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:12:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:12:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:12:16.458013 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:12:16.458032 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:12:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:12:20.993673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:12:20.996049 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:12:20.996055 543705 disk_info.go:196] parse disk info done, disk is : [0xc000367800 0xc000367840]
E0319 21:12:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:23.409756 543705 memory.go:184] no items to output this cycle
I0319 21:12:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:12:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:33.409781 543705 memory.go:184] no items to output this cycle
I0319 21:12:33.409813 543705 cpu.go:275] no items to output this cycle
I0319 21:12:37.893875 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:12:37.893882 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:12:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:43.410746 543705 memory.go:191] Add success.
I0319 21:12:43.409825 543705 cpu.go:282] Add success.
I0319 21:12:43.420507 543705 net.go:648] Add success.
I0319 21:12:43.423274 543705 net.go:770] primary dev: ETH0
I0319 21:12:43.423292 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:12:43.423308 543705 net.go:698] Add success.
I0319 21:12:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:12:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:12:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:12:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:12:53.409787 543705 memory.go:184] no items to output this cycle
I0319 21:12:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 21:13:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:03.409792 543705 memory.go:184] no items to output this cycle
I0319 21:13:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:13:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:13.409795 543705 memory.go:191] Add success.
I0319 21:13:13.409812 543705 cpu.go:282] Add success.
W0319 21:13:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:13:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:13:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:13:13.420273 543705 net.go:648] Add success.
I0319 21:13:13.422954 543705 net.go:770] primary dev: ETH0
I0319 21:13:13.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:13:13.422978 543705 net.go:698] Add success.
I0319 21:13:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:13:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:13:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 21:13:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:13:14.456490 543705 disk_worker.go:494] system disk:vda1
I0319 21:13:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:13:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:13:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:13:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:13:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:13:20.997674 543705 disk_info.go:125] begin check local disk info of client
I0319 21:13:21.000229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:13:21.000237 543705 disk_info.go:196] parse disk info done, disk is : [0xc000391300 0xc000391340]
E0319 21:13:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:23.409781 543705 memory.go:184] no items to output this cycle
I0319 21:13:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 21:13:33.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 21:13:33.409831 543705 memory.go:184] no items to output this cycle
E0319 21:13:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:43.409810 543705 memory.go:191] Add success.
I0319 21:13:43.409810 543705 cpu.go:282] Add success.
I0319 21:13:43.420440 543705 net.go:648] Add success.
I0319 21:13:43.423513 543705 net.go:770] primary dev: ETH0
I0319 21:13:43.423529 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:13:43.423542 543705 net.go:698] Add success.
I0319 21:13:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:13:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:13:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:13:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:13:53.409790 543705 memory.go:184] no items to output this cycle
I0319 21:13:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:14:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:03.409796 543705 memory.go:184] no items to output this cycle
I0319 21:14:03.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:14:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:13.409804 543705 memory.go:191] Add success.
I0319 21:14:13.409815 543705 cpu.go:282] Add success.
W0319 21:14:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:14:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:14:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:14:13.420161 543705 net.go:648] Add success.
I0319 21:14:13.422920 543705 net.go:770] primary dev: ETH0
I0319 21:14:13.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:14:13.422944 543705 net.go:698] Add success.
I0319 21:14:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:14:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:14:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 21:14:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:14:14.456494 543705 disk_worker.go:494] system disk:vda1
I0319 21:14:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:14:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:14:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:14:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:14:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:14:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:14:21.001672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:14:21.004104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:14:21.004111 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004746c0 0xc000474700]
E0319 21:14:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:23.409762 543705 memory.go:184] no items to output this cycle
I0319 21:14:23.409822 543705 cpu.go:275] no items to output this cycle
E0319 21:14:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:33.409779 543705 memory.go:184] no items to output this cycle
I0319 21:14:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 21:14:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:43.409923 543705 memory.go:191] Add success.
I0319 21:14:43.409943 543705 cpu.go:282] Add success.
I0319 21:14:43.419752 543705 net.go:648] Add success.
I0319 21:14:43.422832 543705 net.go:770] primary dev: ETH0
I0319 21:14:43.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:14:43.422861 543705 net.go:698] Add success.
I0319 21:14:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:14:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:14:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:14:53.409797 543705 memory.go:184] no items to output this cycle
I0319 21:14:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 21:15:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:03.409783 543705 memory.go:184] no items to output this cycle
I0319 21:15:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 21:15:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:13.409779 543705 memory.go:191] Add success.
I0319 21:15:13.409804 543705 cpu.go:282] Add success.
W0319 21:15:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:15:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:15:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:15:13.420073 543705 net.go:648] Add success.
I0319 21:15:13.422904 543705 net.go:770] primary dev: ETH0
I0319 21:15:13.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:15:13.422934 543705 net.go:698] Add success.
I0319 21:15:13.721883 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fbc7c6c4-7fc9-49cf-9eed-8c44c0206840","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:15:13.721917 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:15:14.454680 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:15:14.454816 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:15:14.454888 543705 disk_worker.go:708] disk space is not compliant
W0319 21:15:14.454891 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:15:14.456233 543705 disk_worker.go:494] system disk:vda1
I0319 21:15:14.456287 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:15:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:15:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:15:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:15:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:15:16.472476 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:15:21.005676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:15:21.008072 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:15:21.008078 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abec0 0xc0001abf00]
E0319 21:15:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:23.409793 543705 memory.go:184] no items to output this cycle
I0319 21:15:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:15:33.409852 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:33.409873 543705 memory.go:184] no items to output this cycle
I0319 21:15:33.409947 543705 cpu.go:275] no items to output this cycle
I0319 21:15:37.896178 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:15:37.896184 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:15:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:43.410667 543705 memory.go:191] Add success.
I0319 21:15:43.409810 543705 cpu.go:282] Add success.
I0319 21:15:43.420392 543705 net.go:648] Add success.
I0319 21:15:43.423030 543705 net.go:770] primary dev: ETH0
I0319 21:15:43.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:15:43.423061 543705 net.go:698] Add success.
I0319 21:15:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:15:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:15:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:15:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:15:53.409786 543705 memory.go:184] no items to output this cycle
I0319 21:15:53.409789 543705 cpu.go:275] no items to output this cycle
E0319 21:16:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:03.409804 543705 memory.go:184] no items to output this cycle
I0319 21:16:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 21:16:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:13.409780 543705 memory.go:191] Add success.
I0319 21:16:13.409804 543705 cpu.go:282] Add success.
W0319 21:16:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:16:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:16:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:16:13.420090 543705 net.go:648] Add success.
I0319 21:16:13.422956 543705 net.go:770] primary dev: ETH0
I0319 21:16:13.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:16:13.422984 543705 net.go:698] Add success.
I0319 21:16:14.454986 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:16:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:16:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 21:16:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:16:14.456572 543705 disk_worker.go:494] system disk:vda1
I0319 21:16:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:16:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:16:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:16:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:16:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:16:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:16:21.009676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:16:21.012154 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:16:21.012160 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466140 0xc000466180]
E0319 21:16:23.409849 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:23.409864 543705 memory.go:184] no items to output this cycle
I0319 21:16:23.409956 543705 cpu.go:275] no items to output this cycle
E0319 21:16:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:33.409789 543705 memory.go:184] no items to output this cycle
I0319 21:16:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 21:16:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:43.409784 543705 memory.go:191] Add success.
I0319 21:16:43.409804 543705 cpu.go:282] Add success.
I0319 21:16:43.420016 543705 net.go:648] Add success.
I0319 21:16:43.422662 543705 net.go:770] primary dev: ETH0
I0319 21:16:43.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:16:43.422688 543705 net.go:698] Add success.
I0319 21:16:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:16:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:16:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:16:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:16:53.409799 543705 memory.go:184] no items to output this cycle
I0319 21:16:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:17:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:03.409793 543705 cpu.go:275] no items to output this cycle
I0319 21:17:03.409795 543705 memory.go:184] no items to output this cycle
E0319 21:17:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:13.409810 543705 memory.go:191] Add success.
I0319 21:17:13.409817 543705 cpu.go:282] Add success.
W0319 21:17:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:17:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:17:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:17:13.420084 543705 net.go:648] Add success.
I0319 21:17:13.423094 543705 net.go:770] primary dev: ETH0
I0319 21:17:13.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:17:13.423120 543705 net.go:698] Add success.
I0319 21:17:13.453665 543705 event_worker.go:152] Polling the log file for events...
W0319 21:17:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:17:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 21:17:14.455195 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:17:14.455882 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:17:14.455890 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:17:14.455896 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:17:14.456551 543705 disk_worker.go:494] system disk:vda1
I0319 21:17:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:17:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:17:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:17:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:17:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:17:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:17:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:17:16.472356 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:17:21.013677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:17:21.016066 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:17:21.016072 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a0c0 0xc00039a100]
E0319 21:17:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:23.409794 543705 memory.go:184] no items to output this cycle
I0319 21:17:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 21:17:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:33.409774 543705 memory.go:184] no items to output this cycle
I0319 21:17:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 21:17:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:43.409814 543705 memory.go:191] Add success.
I0319 21:17:43.409819 543705 cpu.go:282] Add success.
I0319 21:17:43.419960 543705 net.go:648] Add success.
I0319 21:17:43.422755 543705 net.go:770] primary dev: ETH0
I0319 21:17:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:17:43.422781 543705 net.go:698] Add success.
I0319 21:17:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:17:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:17:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:17:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:17:53.409785 543705 memory.go:184] no items to output this cycle
I0319 21:17:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 21:18:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:03.409780 543705 memory.go:184] no items to output this cycle
I0319 21:18:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 21:18:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:13.409812 543705 memory.go:191] Add success.
I0319 21:18:13.409819 543705 cpu.go:282] Add success.
W0319 21:18:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:18:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:18:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:18:13.420056 543705 net.go:648] Add success.
I0319 21:18:13.423114 543705 net.go:770] primary dev: ETH0
I0319 21:18:13.423127 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:18:13.423139 543705 net.go:698] Add success.
I0319 21:18:13.518906 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"73afa646-e6b9-43a3-b6d5-ca427f2be51a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:18:13.518941 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:18:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:18:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:18:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 21:18:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:18:14.456546 543705 disk_worker.go:494] system disk:vda1
I0319 21:18:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:18:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:18:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:18:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:18:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:18:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:18:21.017669 543705 disk_info.go:125] begin check local disk info of client
I0319 21:18:21.020151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:18:21.020159 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256380 0xc0002563c0]
E0319 21:18:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:23.409800 543705 memory.go:184] no items to output this cycle
I0319 21:18:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 21:18:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:33.409812 543705 memory.go:184] no items to output this cycle
I0319 21:18:33.409826 543705 cpu.go:275] no items to output this cycle
I0319 21:18:37.897733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:18:37.897740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:18:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:43.410628 543705 memory.go:191] Add success.
I0319 21:18:43.409832 543705 cpu.go:282] Add success.
I0319 21:18:43.420415 543705 net.go:648] Add success.
I0319 21:18:43.422923 543705 net.go:770] primary dev: ETH0
I0319 21:18:43.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:18:43.422948 543705 net.go:698] Add success.
I0319 21:18:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:18:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:18:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:18:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:18:53.409801 543705 memory.go:184] no items to output this cycle
I0319 21:18:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 21:19:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:03.409776 543705 memory.go:184] no items to output this cycle
I0319 21:19:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:19:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:13.409786 543705 memory.go:191] Add success.
I0319 21:19:13.409802 543705 cpu.go:282] Add success.
W0319 21:19:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:19:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:19:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:19:13.420240 543705 net.go:648] Add success.
I0319 21:19:13.423552 543705 net.go:770] primary dev: ETH0
I0319 21:19:13.423568 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:19:13.423581 543705 net.go:698] Add success.
I0319 21:19:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:19:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:19:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 21:19:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:19:14.456590 543705 disk_worker.go:494] system disk:vda1
I0319 21:19:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:19:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:19:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:19:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:19:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:19:21.021667 543705 disk_info.go:125] begin check local disk info of client
I0319 21:19:21.024144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:19:21.024151 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370340 0xc000370380]
E0319 21:19:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:23.409759 543705 memory.go:184] no items to output this cycle
I0319 21:19:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:19:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 21:19:33.409803 543705 memory.go:184] no items to output this cycle
E0319 21:19:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:43.409783 543705 memory.go:191] Add success.
I0319 21:19:43.409788 543705 cpu.go:282] Add success.
I0319 21:19:43.419888 543705 net.go:648] Add success.
I0319 21:19:43.423097 543705 net.go:770] primary dev: ETH0
I0319 21:19:43.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:19:43.423123 543705 net.go:698] Add success.
I0319 21:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:19:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:19:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:19:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:19:53.409797 543705 memory.go:184] no items to output this cycle
I0319 21:19:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 21:20:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:03.409803 543705 memory.go:184] no items to output this cycle
I0319 21:20:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 21:20:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:13.409813 543705 memory.go:191] Add success.
I0319 21:20:13.409825 543705 cpu.go:282] Add success.
W0319 21:20:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:20:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:20:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:20:13.420064 543705 net.go:648] Add success.
I0319 21:20:13.423331 543705 net.go:770] primary dev: ETH0
I0319 21:20:13.423344 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:20:13.423356 543705 net.go:698] Add success.
I0319 21:20:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:20:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:20:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 21:20:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:20:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 21:20:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:20:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:20:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:20:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:20:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:20:21.025671 543705 disk_info.go:125] begin check local disk info of client
I0319 21:20:21.028151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:20:21.028157 543705 disk_info.go:196] parse disk info done, disk is : [0xc000265200 0xc000265240]
E0319 21:20:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:23.409762 543705 memory.go:184] no items to output this cycle
I0319 21:20:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 21:20:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:33.409799 543705 memory.go:184] no items to output this cycle
I0319 21:20:33.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:20:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:43.409784 543705 memory.go:191] Add success.
I0319 21:20:43.409815 543705 cpu.go:282] Add success.
I0319 21:20:43.419895 543705 net.go:648] Add success.
I0319 21:20:43.422854 543705 net.go:770] primary dev: ETH0
I0319 21:20:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:20:43.422880 543705 net.go:698] Add success.
I0319 21:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:20:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:20:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:20:53.410260 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:20:53.410283 543705 memory.go:184] no items to output this cycle
I0319 21:20:53.410296 543705 cpu.go:275] no items to output this cycle
E0319 21:21:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:03.409782 543705 memory.go:184] no items to output this cycle
I0319 21:21:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 21:21:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:13.409789 543705 memory.go:191] Add success.
I0319 21:21:13.409812 543705 cpu.go:282] Add success.
W0319 21:21:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:21:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:21:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:21:13.420112 543705 net.go:648] Add success.
I0319 21:21:13.422821 543705 net.go:770] primary dev: ETH0
I0319 21:21:13.422836 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:21:13.422850 543705 net.go:698] Add success.
I0319 21:21:13.535811 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0c76dba3-5727-4872-9a64-6b1f6b334142","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:21:13.535846 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:21:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:21:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:21:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 21:21:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:21:14.456697 543705 disk_worker.go:494] system disk:vda1
I0319 21:21:14.456735 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:21:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:21:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:21:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:21:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:21:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:21:21.029673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:21:21.032081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:21:21.032088 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002982c0 0xc000298300]
E0319 21:21:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:23.409760 543705 memory.go:184] no items to output this cycle
I0319 21:21:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 21:21:33.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:33.409901 543705 memory.go:184] no items to output this cycle
I0319 21:21:33.409968 543705 cpu.go:275] no items to output this cycle
I0319 21:21:37.900201 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:21:37.900206 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:21:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:43.410715 543705 memory.go:191] Add success.
I0319 21:21:43.409811 543705 cpu.go:282] Add success.
I0319 21:21:43.420539 543705 net.go:648] Add success.
I0319 21:21:43.423145 543705 net.go:770] primary dev: ETH0
I0319 21:21:43.423156 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:21:43.423169 543705 net.go:698] Add success.
I0319 21:21:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:21:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:21:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:21:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:21:53.409783 543705 memory.go:184] no items to output this cycle
I0319 21:21:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 21:22:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:03.409784 543705 memory.go:184] no items to output this cycle
I0319 21:22:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 21:22:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:13.409786 543705 memory.go:191] Add success.
W0319 21:22:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:22:13.409812 543705 cpu.go:282] Add success.
W0319 21:22:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:22:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:22:13.420048 543705 net.go:648] Add success.
I0319 21:22:13.423254 543705 net.go:770] primary dev: ETH0
I0319 21:22:13.423268 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:22:13.423279 543705 net.go:698] Add success.
W0319 21:22:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:22:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 21:22:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:22:14.455870 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:22:14.455879 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:22:14.455885 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:22:14.456603 543705 disk_worker.go:494] system disk:vda1
I0319 21:22:14.456648 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:22:15.456859 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:22:15.456868 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:22:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:22:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:22:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:22:16.457981 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:22:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:22:21.033673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:22:21.036047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:22:21.036053 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252d00 0xc000252d40]
E0319 21:22:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:23.409774 543705 memory.go:184] no items to output this cycle
I0319 21:22:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:22:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:33.409786 543705 memory.go:184] no items to output this cycle
I0319 21:22:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 21:22:43.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:43.409940 543705 memory.go:191] Add success.
I0319 21:22:43.409990 543705 cpu.go:282] Add success.
I0319 21:22:43.419711 543705 net.go:648] Add success.
I0319 21:22:43.422340 543705 net.go:770] primary dev: ETH0
I0319 21:22:43.422353 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:22:43.422365 543705 net.go:698] Add success.
I0319 21:22:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:22:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:22:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:22:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:22:53.409788 543705 memory.go:184] no items to output this cycle
I0319 21:22:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:23:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:03.409770 543705 memory.go:184] no items to output this cycle
I0319 21:23:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 21:23:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:13.409826 543705 memory.go:191] Add success.
I0319 21:23:13.409834 543705 cpu.go:282] Add success.
W0319 21:23:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:23:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:23:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:23:13.420108 543705 net.go:648] Add success.
I0319 21:23:13.422984 543705 net.go:770] primary dev: ETH0
I0319 21:23:13.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:23:13.423009 543705 net.go:698] Add success.
I0319 21:23:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:23:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:23:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0319 21:23:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:23:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 21:23:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:23:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:23:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:23:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:23:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:23:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:23:21.037675 543705 disk_info.go:125] begin check local disk info of client
I0319 21:23:21.040106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:23:21.040113 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e8300 0xc0000e8340]
E0319 21:23:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:23.409789 543705 memory.go:184] no items to output this cycle
I0319 21:23:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 21:23:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:33.409805 543705 memory.go:184] no items to output this cycle
I0319 21:23:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 21:23:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:43.409877 543705 memory.go:191] Add success.
I0319 21:23:43.409945 543705 cpu.go:282] Add success.
I0319 21:23:43.419712 543705 net.go:648] Add success.
I0319 21:23:43.422634 543705 net.go:770] primary dev: ETH0
I0319 21:23:43.422646 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:23:43.422658 543705 net.go:698] Add success.
I0319 21:23:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:23:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:23:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:23:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:23:53.409776 543705 memory.go:184] no items to output this cycle
I0319 21:23:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 21:24:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:03.409788 543705 memory.go:184] no items to output this cycle
I0319 21:24:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 21:24:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:13.409784 543705 memory.go:191] Add success.
I0319 21:24:13.409804 543705 cpu.go:282] Add success.
W0319 21:24:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:24:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:24:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:24:13.420285 543705 net.go:648] Add success.
I0319 21:24:13.423031 543705 net.go:770] primary dev: ETH0
I0319 21:24:13.423044 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:24:13.423057 543705 net.go:698] Add success.
I0319 21:24:13.699424 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"29a197ec-3e39-42da-ae8d-80ccbf3a5459","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:24:13.699457 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:24:14.453966 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:24:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:24:14.455281 543705 disk_worker.go:708] disk space is not compliant
W0319 21:24:14.455284 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:24:14.456828 543705 disk_worker.go:494] system disk:vda1
I0319 21:24:14.456858 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:24:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:24:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:24:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:24:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:24:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:24:21.041670 543705 disk_info.go:125] begin check local disk info of client
I0319 21:24:21.044055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:24:21.044061 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f8900 0xc0001f8940]
E0319 21:24:23.410297 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:23.410317 543705 memory.go:184] no items to output this cycle
I0319 21:24:23.410325 543705 cpu.go:275] no items to output this cycle
E0319 21:24:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:33.409791 543705 memory.go:184] no items to output this cycle
I0319 21:24:33.409792 543705 cpu.go:275] no items to output this cycle
I0319 21:24:37.901740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:24:37.901747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:24:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:43.409827 543705 cpu.go:282] Add success.
I0319 21:24:43.410800 543705 memory.go:191] Add success.
I0319 21:24:43.419695 543705 net.go:648] Add success.
I0319 21:24:43.422390 543705 net.go:770] primary dev: ETH0
I0319 21:24:43.422402 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:24:43.422414 543705 net.go:698] Add success.
I0319 21:24:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:24:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:24:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:24:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:24:53.409792 543705 memory.go:184] no items to output this cycle
I0319 21:24:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 21:25:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:03.409784 543705 memory.go:184] no items to output this cycle
I0319 21:25:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 21:25:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:13.409794 543705 memory.go:191] Add success.
I0319 21:25:13.409796 543705 cpu.go:282] Add success.
W0319 21:25:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:25:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:25:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:25:13.420089 543705 net.go:648] Add success.
I0319 21:25:13.422992 543705 net.go:770] primary dev: ETH0
I0319 21:25:13.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:25:13.423018 543705 net.go:698] Add success.
I0319 21:25:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:25:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:25:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0319 21:25:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:25:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 21:25:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:25:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:25:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:25:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:25:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:25:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:25:21.045673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:25:21.048126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:25:21.048132 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9a80 0xc0001f9ac0]
E0319 21:25:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:23.409788 543705 cpu.go:275] no items to output this cycle
I0319 21:25:23.409798 543705 memory.go:184] no items to output this cycle
E0319 21:25:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:33.409779 543705 memory.go:184] no items to output this cycle
I0319 21:25:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:25:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:43.409807 543705 memory.go:191] Add success.
I0319 21:25:43.409815 543705 cpu.go:282] Add success.
I0319 21:25:43.420118 543705 net.go:648] Add success.
I0319 21:25:43.422925 543705 net.go:770] primary dev: ETH0
I0319 21:25:43.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:25:43.422954 543705 net.go:698] Add success.
I0319 21:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:25:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:25:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:25:53.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:25:53.409827 543705 memory.go:184] no items to output this cycle
I0319 21:25:53.409833 543705 cpu.go:275] no items to output this cycle
E0319 21:26:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:03.409801 543705 memory.go:184] no items to output this cycle
I0319 21:26:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 21:26:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:13.409788 543705 memory.go:191] Add success.
I0319 21:26:13.409792 543705 cpu.go:282] Add success.
W0319 21:26:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:26:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:26:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:26:13.420288 543705 net.go:648] Add success.
I0319 21:26:13.423261 543705 net.go:770] primary dev: ETH0
I0319 21:26:13.423275 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:26:13.423288 543705 net.go:698] Add success.
I0319 21:26:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:26:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:26:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0319 21:26:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:26:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 21:26:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:26:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:26:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:26:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:26:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:26:21.049687 543705 disk_info.go:125] begin check local disk info of client
I0319 21:26:21.052128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:26:21.052135 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be80 0xc00007bec0]
E0319 21:26:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:23.409811 543705 memory.go:184] no items to output this cycle
I0319 21:26:23.409823 543705 cpu.go:275] no items to output this cycle
E0319 21:26:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:33.409790 543705 memory.go:184] no items to output this cycle
I0319 21:26:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:26:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:43.409785 543705 memory.go:191] Add success.
I0319 21:26:43.409803 543705 cpu.go:282] Add success.
I0319 21:26:43.419894 543705 net.go:648] Add success.
I0319 21:26:43.422589 543705 net.go:770] primary dev: ETH0
I0319 21:26:43.422602 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:26:43.422617 543705 net.go:698] Add success.
I0319 21:26:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:26:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:26:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:26:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:26:53.409778 543705 memory.go:184] no items to output this cycle
I0319 21:26:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 21:27:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:03.409810 543705 memory.go:184] no items to output this cycle
I0319 21:27:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 21:27:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:13.409795 543705 memory.go:191] Add success.
I0319 21:27:13.409797 543705 cpu.go:282] Add success.
W0319 21:27:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:27:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:27:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:27:13.420030 543705 net.go:648] Add success.
I0319 21:27:13.423224 543705 net.go:770] primary dev: ETH0
I0319 21:27:13.423236 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:27:13.423248 543705 net.go:698] Add success.
I0319 21:27:13.429499 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 21:27:13.453662 543705 event_worker.go:152] Polling the log file for events...
I0319 21:27:13.469715 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2bac6259-cb1a-4946-9dbf-e36de3ca6f6a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:27:13.469749 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 21:27:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:27:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 21:27:14.455196 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:27:14.455933 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:27:14.455942 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:27:14.455948 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:27:14.456797 543705 disk_worker.go:494] system disk:vda1
I0319 21:27:14.456828 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:27:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:27:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:27:16.457947 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:27:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:27:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:27:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:27:16.472347 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:27:21.053679 543705 disk_info.go:125] begin check local disk info of client
I0319 21:27:21.056112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:27:21.056119 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049cf80 0xc00049cfc0]
E0319 21:27:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:23.409761 543705 memory.go:184] no items to output this cycle
I0319 21:27:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:27:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:33.409814 543705 memory.go:184] no items to output this cycle
I0319 21:27:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 21:27:37.901890 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:27:37.901897 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:27:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:43.410724 543705 memory.go:191] Add success.
I0319 21:27:43.409803 543705 cpu.go:282] Add success.
I0319 21:27:43.420547 543705 net.go:648] Add success.
I0319 21:27:43.423325 543705 net.go:770] primary dev: ETH0
I0319 21:27:43.423341 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:27:43.423355 543705 net.go:698] Add success.
I0319 21:27:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:27:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:27:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:27:53.410257 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:27:53.410274 543705 memory.go:184] no items to output this cycle
I0319 21:27:53.410278 543705 cpu.go:275] no items to output this cycle
E0319 21:28:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:03.409774 543705 memory.go:184] no items to output this cycle
I0319 21:28:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 21:28:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:13.409801 543705 memory.go:191] Add success.
I0319 21:28:13.409805 543705 cpu.go:282] Add success.
W0319 21:28:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:28:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:28:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:28:13.420054 543705 net.go:648] Add success.
I0319 21:28:13.422768 543705 net.go:770] primary dev: ETH0
I0319 21:28:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:28:13.422797 543705 net.go:698] Add success.
I0319 21:28:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:28:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:28:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 21:28:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:28:14.456520 543705 disk_worker.go:494] system disk:vda1
I0319 21:28:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:28:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:28:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:28:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:28:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:28:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:28:21.057687 543705 disk_info.go:125] begin check local disk info of client
I0319 21:28:21.060179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:28:21.060186 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9780 0xc0001f97c0]
E0319 21:28:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:23.409771 543705 memory.go:184] no items to output this cycle
I0319 21:28:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:28:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:33.409797 543705 memory.go:184] no items to output this cycle
I0319 21:28:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 21:28:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:43.409830 543705 memory.go:191] Add success.
I0319 21:28:43.409831 543705 cpu.go:282] Add success.
I0319 21:28:43.419967 543705 net.go:648] Add success.
I0319 21:28:43.422859 543705 net.go:770] primary dev: ETH0
I0319 21:28:43.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:28:43.422889 543705 net.go:698] Add success.
I0319 21:28:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:28:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:28:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:28:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:28:53.409793 543705 memory.go:184] no items to output this cycle
I0319 21:28:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:29:03.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:03.409823 543705 memory.go:184] no items to output this cycle
I0319 21:29:03.409838 543705 cpu.go:275] no items to output this cycle
E0319 21:29:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:13.409799 543705 memory.go:191] Add success.
I0319 21:29:13.409814 543705 cpu.go:282] Add success.
W0319 21:29:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:29:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:29:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:29:13.420073 543705 net.go:648] Add success.
I0319 21:29:13.422968 543705 net.go:770] primary dev: ETH0
I0319 21:29:13.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:29:13.423002 543705 net.go:698] Add success.
I0319 21:29:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:29:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:29:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 21:29:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:29:14.456568 543705 disk_worker.go:494] system disk:vda1
I0319 21:29:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:29:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:29:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:29:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:29:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:29:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:29:21.061677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:29:21.064166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:29:21.064172 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8580 0xc0004a85c0]
E0319 21:29:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:23.409806 543705 memory.go:184] no items to output this cycle
I0319 21:29:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 21:29:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:33.409815 543705 memory.go:184] no items to output this cycle
I0319 21:29:33.409834 543705 cpu.go:275] no items to output this cycle
E0319 21:29:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:43.409778 543705 memory.go:191] Add success.
I0319 21:29:43.409821 543705 cpu.go:282] Add success.
I0319 21:29:43.419865 543705 net.go:648] Add success.
I0319 21:29:43.422917 543705 net.go:770] primary dev: ETH0
I0319 21:29:43.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:29:43.422945 543705 net.go:698] Add success.
I0319 21:29:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:29:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:29:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:29:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:29:53.409861 543705 memory.go:184] no items to output this cycle
I0319 21:29:53.409959 543705 cpu.go:275] no items to output this cycle
E0319 21:30:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:03.409790 543705 memory.go:184] no items to output this cycle
I0319 21:30:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 21:30:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:13.409802 543705 memory.go:191] Add success.
I0319 21:30:13.409808 543705 cpu.go:282] Add success.
W0319 21:30:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:30:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:30:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:30:13.420276 543705 net.go:648] Add success.
I0319 21:30:13.423283 543705 net.go:770] primary dev: ETH0
I0319 21:30:13.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:30:13.423308 543705 net.go:698] Add success.
I0319 21:30:13.468070 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"360dbfd3-86ba-4b29-b619-d4db5da69117","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:30:13.468103 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:30:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:30:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:30:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 21:30:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:30:14.456659 543705 disk_worker.go:494] system disk:vda1
I0319 21:30:14.456690 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:30:15.455616 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:30:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:30:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:30:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:30:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:30:21.065677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:30:21.068149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:30:21.068158 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049b780 0xc00049b7c0]
E0319 21:30:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:23.409798 543705 memory.go:184] no items to output this cycle
I0319 21:30:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 21:30:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:33.409793 543705 memory.go:184] no items to output this cycle
I0319 21:30:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 21:30:37.904217 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:30:37.904224 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:30:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:43.410673 543705 memory.go:191] Add success.
I0319 21:30:43.409799 543705 cpu.go:282] Add success.
I0319 21:30:43.420363 543705 net.go:648] Add success.
I0319 21:30:43.422965 543705 net.go:770] primary dev: ETH0
I0319 21:30:43.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:30:43.422993 543705 net.go:698] Add success.
I0319 21:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:30:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:30:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:30:53.409899 543705 memory.go:184] no items to output this cycle
I0319 21:30:53.409918 543705 cpu.go:275] no items to output this cycle
E0319 21:31:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:03.409797 543705 memory.go:184] no items to output this cycle
I0319 21:31:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:31:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:13.409793 543705 memory.go:191] Add success.
I0319 21:31:13.409796 543705 cpu.go:282] Add success.
W0319 21:31:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:31:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:31:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:31:13.420078 543705 net.go:648] Add success.
I0319 21:31:13.422831 543705 net.go:770] primary dev: ETH0
I0319 21:31:13.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:31:13.422857 543705 net.go:698] Add success.
I0319 21:31:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:31:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:31:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0319 21:31:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:31:14.456608 543705 disk_worker.go:494] system disk:vda1
I0319 21:31:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:31:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:31:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:31:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:31:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:31:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:31:21.069676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:31:21.072207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:31:21.072215 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9ac0 0xc0004a9b00]
E0319 21:31:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:23.409769 543705 memory.go:184] no items to output this cycle
I0319 21:31:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:31:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:33.409792 543705 memory.go:184] no items to output this cycle
I0319 21:31:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:31:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:43.409814 543705 memory.go:191] Add success.
I0319 21:31:43.409818 543705 cpu.go:282] Add success.
I0319 21:31:43.420035 543705 net.go:648] Add success.
I0319 21:31:43.422915 543705 net.go:770] primary dev: ETH0
I0319 21:31:43.422930 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:31:43.422947 543705 net.go:698] Add success.
I0319 21:31:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:31:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:31:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:31:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:31:53.409765 543705 memory.go:184] no items to output this cycle
I0319 21:31:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:32:03.409839 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:03.409858 543705 memory.go:184] no items to output this cycle
I0319 21:32:03.410094 543705 cpu.go:275] no items to output this cycle
E0319 21:32:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:13.409794 543705 memory.go:191] Add success.
I0319 21:32:13.409797 543705 cpu.go:282] Add success.
W0319 21:32:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:32:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:32:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:32:13.420272 543705 net.go:648] Add success.
I0319 21:32:13.423162 543705 net.go:770] primary dev: ETH0
I0319 21:32:13.423174 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:32:13.423186 543705 net.go:698] Add success.
W0319 21:32:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:32:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 21:32:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:32:14.456894 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:32:14.456904 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:32:14.456910 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:32:14.456981 543705 disk_worker.go:494] system disk:vda1
I0319 21:32:14.457011 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:32:15.456862 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:32:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:32:16.458039 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:32:16.458039 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:32:16.458108 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:32:16.458129 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:32:16.472491 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:32:21.073673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:32:21.076191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:32:21.076199 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9280 0xc0004a92c0]
E0319 21:32:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:23.409764 543705 memory.go:184] no items to output this cycle
I0319 21:32:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 21:32:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:33.409812 543705 memory.go:184] no items to output this cycle
I0319 21:32:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 21:32:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:43.409783 543705 memory.go:191] Add success.
I0319 21:32:43.409816 543705 cpu.go:282] Add success.
I0319 21:32:43.419866 543705 net.go:648] Add success.
I0319 21:32:43.422773 543705 net.go:770] primary dev: ETH0
I0319 21:32:43.422788 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:32:43.422803 543705 net.go:698] Add success.
I0319 21:32:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:32:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:32:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:32:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:32:53.409765 543705 memory.go:184] no items to output this cycle
I0319 21:32:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:33:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:03.409811 543705 memory.go:184] no items to output this cycle
I0319 21:33:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 21:33:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:13.409801 543705 memory.go:191] Add success.
I0319 21:33:13.409803 543705 cpu.go:282] Add success.
W0319 21:33:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:33:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:33:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:33:13.420150 543705 net.go:648] Add success.
I0319 21:33:13.422842 543705 net.go:770] primary dev: ETH0
I0319 21:33:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:33:13.422867 543705 net.go:698] Add success.
I0319 21:33:13.464172 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8e4259bf-add7-446c-a55f-3ff0437f2b0d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:33:13.464204 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:33:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:33:14.455219 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:33:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0319 21:33:14.455233 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:33:14.456599 543705 disk_worker.go:494] system disk:vda1
I0319 21:33:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:33:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:33:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:33:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:33:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:33:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:33:21.077679 543705 disk_info.go:125] begin check local disk info of client
I0319 21:33:21.080200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:33:21.080207 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bd00 0xc00007bd80]
E0319 21:33:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:23.409795 543705 memory.go:184] no items to output this cycle
I0319 21:33:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:33:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:33.409797 543705 memory.go:184] no items to output this cycle
I0319 21:33:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 21:33:37.905736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:33:37.905743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:33:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:43.410739 543705 memory.go:191] Add success.
I0319 21:33:43.409801 543705 cpu.go:282] Add success.
I0319 21:33:43.420452 543705 net.go:648] Add success.
I0319 21:33:43.423163 543705 net.go:770] primary dev: ETH0
I0319 21:33:43.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:33:43.423189 543705 net.go:698] Add success.
I0319 21:33:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:33:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:33:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:33:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:33:53.409780 543705 memory.go:184] no items to output this cycle
I0319 21:33:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 21:34:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:03.409779 543705 memory.go:184] no items to output this cycle
I0319 21:34:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:34:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:13.409815 543705 memory.go:191] Add success.
I0319 21:34:13.409824 543705 cpu.go:282] Add success.
W0319 21:34:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:34:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:34:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:34:13.420139 543705 net.go:648] Add success.
I0319 21:34:13.422991 543705 net.go:770] primary dev: ETH0
I0319 21:34:13.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:34:13.423015 543705 net.go:698] Add success.
I0319 21:34:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:34:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:34:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 21:34:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:34:14.456612 543705 disk_worker.go:494] system disk:vda1
I0319 21:34:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:34:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:34:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:34:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:34:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:34:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:34:21.081676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:34:21.084298 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:34:21.084304 543705 disk_info.go:196] parse disk info done, disk is : [0xc00060a500 0xc00060a540]
E0319 21:34:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:34:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 21:34:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:33.409783 543705 memory.go:184] no items to output this cycle
I0319 21:34:33.409815 543705 cpu.go:275] no items to output this cycle
E0319 21:34:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:43.409823 543705 memory.go:191] Add success.
I0319 21:34:43.409828 543705 cpu.go:282] Add success.
I0319 21:34:43.420007 543705 net.go:648] Add success.
I0319 21:34:43.422761 543705 net.go:770] primary dev: ETH0
I0319 21:34:43.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:34:43.422785 543705 net.go:698] Add success.
I0319 21:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:34:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:34:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:34:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:34:53.409769 543705 memory.go:184] no items to output this cycle
I0319 21:34:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:35:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:03.409816 543705 memory.go:184] no items to output this cycle
I0319 21:35:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 21:35:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:13.409798 543705 memory.go:191] Add success.
I0319 21:35:13.409800 543705 cpu.go:282] Add success.
W0319 21:35:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:35:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:35:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:35:13.419668 543705 net.go:648] Add success.
I0319 21:35:13.422294 543705 net.go:770] primary dev: ETH0
I0319 21:35:13.422308 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:35:13.422320 543705 net.go:698] Add success.
I0319 21:35:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:35:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:35:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 21:35:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:35:14.456591 543705 disk_worker.go:494] system disk:vda1
I0319 21:35:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:35:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:35:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:35:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:35:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:35:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:35:21.085685 543705 disk_info.go:125] begin check local disk info of client
I0319 21:35:21.088112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:35:21.088117 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b5a80 0xc0002b5ac0]
E0319 21:35:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:23.409786 543705 memory.go:184] no items to output this cycle
I0319 21:35:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:35:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:33.409797 543705 memory.go:184] no items to output this cycle
I0319 21:35:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:35:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:43.409784 543705 memory.go:191] Add success.
I0319 21:35:43.409803 543705 cpu.go:282] Add success.
I0319 21:35:43.419856 543705 net.go:648] Add success.
I0319 21:35:43.422477 543705 net.go:770] primary dev: ETH0
I0319 21:35:43.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:35:43.422507 543705 net.go:698] Add success.
I0319 21:35:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:35:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:35:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:35:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:35:53.409779 543705 memory.go:184] no items to output this cycle
I0319 21:35:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:36:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:03.409787 543705 memory.go:184] no items to output this cycle
I0319 21:36:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 21:36:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:13.409914 543705 memory.go:191] Add success.
W0319 21:36:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:36:13.409965 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:36:13.409968 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:36:13.410004 543705 cpu.go:282] Add success.
I0319 21:36:13.419755 543705 net.go:648] Add success.
I0319 21:36:13.422524 543705 net.go:770] primary dev: ETH0
I0319 21:36:13.422537 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:36:13.422548 543705 net.go:698] Add success.
I0319 21:36:13.471425 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0169740d-ec40-4e15-acec-b233fc19f0c9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:36:13.471456 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:36:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:36:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:36:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 21:36:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:36:14.456575 543705 disk_worker.go:494] system disk:vda1
I0319 21:36:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:36:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:36:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:36:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:36:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:36:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:36:21.089675 543705 disk_info.go:125] begin check local disk info of client
I0319 21:36:21.092108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:36:21.092114 543705 disk_info.go:196] parse disk info done, disk is : [0xc00060b6c0 0xc00060b700]
E0319 21:36:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:23.409790 543705 memory.go:184] no items to output this cycle
I0319 21:36:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:36:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:33.409793 543705 memory.go:184] no items to output this cycle
I0319 21:36:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 21:36:37.905888 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:36:37.905895 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:36:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:43.410676 543705 memory.go:191] Add success.
I0319 21:36:43.409815 543705 cpu.go:282] Add success.
I0319 21:36:43.420361 543705 net.go:648] Add success.
I0319 21:36:43.423090 543705 net.go:770] primary dev: ETH0
I0319 21:36:43.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:36:43.423115 543705 net.go:698] Add success.
I0319 21:36:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:36:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:36:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:36:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:36:53.409777 543705 memory.go:184] no items to output this cycle
I0319 21:36:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 21:37:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:03.409807 543705 memory.go:184] no items to output this cycle
I0319 21:37:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 21:37:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:13.409824 543705 memory.go:191] Add success.
I0319 21:37:13.409831 543705 cpu.go:282] Add success.
W0319 21:37:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:37:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:37:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:37:13.420174 543705 net.go:648] Add success.
I0319 21:37:13.422719 543705 net.go:770] primary dev: ETH0
I0319 21:37:13.422731 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:37:13.422743 543705 net.go:698] Add success.
I0319 21:37:13.453445 543705 event_worker.go:152] Polling the log file for events...
W0319 21:37:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:37:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 21:37:14.455162 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:37:14.456916 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:37:14.456926 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:37:14.456932 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:37:14.456997 543705 disk_worker.go:494] system disk:vda1
I0319 21:37:14.457036 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:37:15.456842 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:37:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:37:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:37:16.457971 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:37:16.458014 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:37:16.458031 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:37:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:37:21.093671 543705 disk_info.go:125] begin check local disk info of client
I0319 21:37:21.096123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:37:21.096129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e8540 0xc0003e8580]
E0319 21:37:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:23.409761 543705 memory.go:184] no items to output this cycle
I0319 21:37:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 21:37:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:33.409784 543705 memory.go:184] no items to output this cycle
I0319 21:37:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 21:37:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:43.409816 543705 memory.go:191] Add success.
I0319 21:37:43.409825 543705 cpu.go:282] Add success.
I0319 21:37:43.419989 543705 net.go:648] Add success.
I0319 21:37:43.422765 543705 net.go:770] primary dev: ETH0
I0319 21:37:43.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:37:43.422791 543705 net.go:698] Add success.
I0319 21:37:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:37:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:37:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:37:53.410187 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:37:53.410202 543705 memory.go:184] no items to output this cycle
I0319 21:37:53.410226 543705 cpu.go:275] no items to output this cycle
E0319 21:38:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:03.409816 543705 memory.go:184] no items to output this cycle
I0319 21:38:03.409823 543705 cpu.go:275] no items to output this cycle
I0319 21:38:13.409908 543705 cpu.go:282] Add success.
E0319 21:38:13.409966 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:13.409994 543705 memory.go:191] Add success.
W0319 21:38:13.410027 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:38:13.410049 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:38:13.410053 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:38:13.419762 543705 net.go:648] Add success.
I0319 21:38:13.422582 543705 net.go:770] primary dev: ETH0
I0319 21:38:13.422595 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:38:13.422606 543705 net.go:698] Add success.
I0319 21:38:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:38:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:38:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 21:38:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:38:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 21:38:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:38:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:38:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:38:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:38:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:38:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:38:21.097677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:38:21.100142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:38:21.100148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d00 0xc0000c4d40]
E0319 21:38:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:23.409805 543705 memory.go:184] no items to output this cycle
I0319 21:38:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 21:38:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:33.409796 543705 memory.go:184] no items to output this cycle
I0319 21:38:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 21:38:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:43.409803 543705 memory.go:191] Add success.
I0319 21:38:43.409810 543705 cpu.go:282] Add success.
I0319 21:38:43.419894 543705 net.go:648] Add success.
I0319 21:38:43.422853 543705 net.go:770] primary dev: ETH0
I0319 21:38:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:38:43.422880 543705 net.go:698] Add success.
I0319 21:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:38:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:38:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:38:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:38:53.409793 543705 memory.go:184] no items to output this cycle
I0319 21:38:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:39:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:03.409800 543705 memory.go:184] no items to output this cycle
I0319 21:39:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 21:39:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:13.409897 543705 memory.go:191] Add success.
W0319 21:39:13.409930 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:39:13.409943 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:39:13.409946 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:39:13.409947 543705 cpu.go:282] Add success.
I0319 21:39:13.419730 543705 net.go:648] Add success.
I0319 21:39:13.422548 543705 net.go:770] primary dev: ETH0
I0319 21:39:13.422562 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:39:13.422575 543705 net.go:698] Add success.
I0319 21:39:13.469022 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f3e888c7-bbf7-4e3c-8dbd-a34726dd0072","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:39:13.469053 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:39:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:39:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:39:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 21:39:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:39:14.456489 543705 disk_worker.go:494] system disk:vda1
I0319 21:39:14.456532 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:39:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:39:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:39:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:39:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:39:21.101672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:39:21.104129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:39:21.104135 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be840 0xc0003be880]
E0319 21:39:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:23.409791 543705 memory.go:184] no items to output this cycle
I0319 21:39:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 21:39:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:33.409783 543705 memory.go:184] no items to output this cycle
I0319 21:39:33.409820 543705 cpu.go:275] no items to output this cycle
I0319 21:39:37.906036 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:39:37.906043 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:39:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:43.410744 543705 memory.go:191] Add success.
I0319 21:39:43.409796 543705 cpu.go:282] Add success.
I0319 21:39:43.420463 543705 net.go:648] Add success.
I0319 21:39:43.423375 543705 net.go:770] primary dev: ETH0
I0319 21:39:43.423396 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:39:43.423415 543705 net.go:698] Add success.
I0319 21:39:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:39:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:39:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:39:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:39:53.409771 543705 memory.go:184] no items to output this cycle
I0319 21:39:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 21:40:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:03.409806 543705 memory.go:184] no items to output this cycle
I0319 21:40:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 21:40:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:13.409786 543705 memory.go:191] Add success.
W0319 21:40:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:40:13.409816 543705 cpu.go:282] Add success.
W0319 21:40:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:40:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:40:13.420401 543705 net.go:648] Add success.
I0319 21:40:13.423124 543705 net.go:770] primary dev: ETH0
I0319 21:40:13.423137 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:40:13.423148 543705 net.go:698] Add success.
I0319 21:40:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:40:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:40:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0319 21:40:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:40:14.456494 543705 disk_worker.go:494] system disk:vda1
I0319 21:40:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:40:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:40:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:40:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:40:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:40:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:40:21.105672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:40:21.108183 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:40:21.108189 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328980 0xc0003289c0]
E0319 21:40:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:40:23.409790 543705 cpu.go:275] no items to output this cycle
E0319 21:40:33.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:33.409822 543705 memory.go:184] no items to output this cycle
I0319 21:40:33.409832 543705 cpu.go:275] no items to output this cycle
E0319 21:40:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:43.409794 543705 memory.go:191] Add success.
I0319 21:40:43.409819 543705 cpu.go:282] Add success.
I0319 21:40:43.419897 543705 net.go:648] Add success.
I0319 21:40:43.422722 543705 net.go:770] primary dev: ETH0
I0319 21:40:43.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:40:43.422760 543705 net.go:698] Add success.
I0319 21:40:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:40:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:40:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:40:53.409775 543705 cpu.go:275] no items to output this cycle
E0319 21:40:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:40:53.409791 543705 memory.go:184] no items to output this cycle
E0319 21:41:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:03.409812 543705 memory.go:184] no items to output this cycle
I0319 21:41:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 21:41:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:13.409791 543705 memory.go:191] Add success.
I0319 21:41:13.409794 543705 cpu.go:282] Add success.
W0319 21:41:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:41:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:41:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:41:13.420043 543705 net.go:648] Add success.
I0319 21:41:13.423027 543705 net.go:770] primary dev: ETH0
I0319 21:41:13.423040 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:41:13.423052 543705 net.go:698] Add success.
I0319 21:41:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:41:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:41:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 21:41:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:41:14.457051 543705 disk_worker.go:494] system disk:vda1
I0319 21:41:14.457080 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:41:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:41:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:41:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:41:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:41:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:41:21.109676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:41:21.112001 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:41:21.112007 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047c380 0xc00047c3c0]
E0319 21:41:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:23.409786 543705 memory.go:184] no items to output this cycle
I0319 21:41:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 21:41:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:33.409795 543705 memory.go:184] no items to output this cycle
I0319 21:41:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:41:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:43.409792 543705 memory.go:191] Add success.
I0319 21:41:43.409794 543705 cpu.go:282] Add success.
I0319 21:41:43.419851 543705 net.go:648] Add success.
I0319 21:41:43.422631 543705 net.go:770] primary dev: ETH0
I0319 21:41:43.422644 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:41:43.422657 543705 net.go:698] Add success.
I0319 21:41:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:41:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:41:53.410345 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:41:53.410366 543705 memory.go:184] no items to output this cycle
I0319 21:41:53.410375 543705 cpu.go:275] no items to output this cycle
E0319 21:42:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:03.409786 543705 memory.go:184] no items to output this cycle
I0319 21:42:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 21:42:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:13.409830 543705 memory.go:191] Add success.
I0319 21:42:13.409834 543705 cpu.go:282] Add success.
W0319 21:42:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:42:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:42:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:42:13.420113 543705 net.go:648] Add success.
I0319 21:42:13.423186 543705 net.go:770] primary dev: ETH0
I0319 21:42:13.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:42:13.423213 543705 net.go:698] Add success.
I0319 21:42:13.470055 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2d97d857-a847-4bbb-9fd2-3aed89f2a4ae","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:42:13.470087 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 21:42:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:42:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 21:42:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:42:14.457346 543705 disk_worker.go:494] system disk:vda1
E0319 21:42:14.457349 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:42:14.457362 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:42:14.457367 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:42:14.457513 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:42:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 21:42:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:42:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:42:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:42:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:42:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:42:16.472336 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:42:21.113688 543705 disk_info.go:125] begin check local disk info of client
I0319 21:42:21.116074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:42:21.116080 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330780 0xc0003307c0]
E0319 21:42:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:23.409786 543705 memory.go:184] no items to output this cycle
I0319 21:42:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 21:42:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:33.409814 543705 memory.go:184] no items to output this cycle
I0319 21:42:33.409829 543705 cpu.go:275] no items to output this cycle
I0319 21:42:37.908236 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:42:37.908243 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:42:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:43.410787 543705 memory.go:191] Add success.
I0319 21:42:43.409802 543705 cpu.go:282] Add success.
I0319 21:42:43.420486 543705 net.go:648] Add success.
I0319 21:42:43.423339 543705 net.go:770] primary dev: ETH0
I0319 21:42:43.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:42:43.423366 543705 net.go:698] Add success.
I0319 21:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:42:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:42:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:42:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:42:53.409768 543705 memory.go:184] no items to output this cycle
I0319 21:42:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 21:43:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:03.409810 543705 memory.go:184] no items to output this cycle
I0319 21:43:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 21:43:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:13.409789 543705 memory.go:191] Add success.
I0319 21:43:13.409807 543705 cpu.go:282] Add success.
W0319 21:43:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:43:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:43:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:43:13.420138 543705 net.go:648] Add success.
I0319 21:43:13.423082 543705 net.go:770] primary dev: ETH0
I0319 21:43:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:43:13.423106 543705 net.go:698] Add success.
I0319 21:43:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:43:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:43:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 21:43:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:43:14.456496 543705 disk_worker.go:494] system disk:vda1
I0319 21:43:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:43:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:43:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:43:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:43:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:43:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:43:21.117671 543705 disk_info.go:125] begin check local disk info of client
I0319 21:43:21.120123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:43:21.120132 543705 disk_info.go:196] parse disk info done, disk is : [0xc000369280 0xc0003692c0]
E0319 21:43:23.410371 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:23.410386 543705 memory.go:184] no items to output this cycle
I0319 21:43:23.410430 543705 cpu.go:275] no items to output this cycle
E0319 21:43:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:33.409790 543705 memory.go:184] no items to output this cycle
I0319 21:43:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 21:43:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:43.409826 543705 memory.go:191] Add success.
I0319 21:43:43.409831 543705 cpu.go:282] Add success.
I0319 21:43:43.419982 543705 net.go:648] Add success.
I0319 21:43:43.423043 543705 net.go:770] primary dev: ETH0
I0319 21:43:43.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:43:43.423080 543705 net.go:698] Add success.
I0319 21:43:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:43:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:43:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:43:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:43:53.409771 543705 memory.go:184] no items to output this cycle
I0319 21:43:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:44:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:03.409812 543705 memory.go:184] no items to output this cycle
I0319 21:44:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 21:44:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:13.409808 543705 memory.go:191] Add success.
I0319 21:44:13.409811 543705 cpu.go:282] Add success.
W0319 21:44:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:44:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:44:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:44:13.420182 543705 net.go:648] Add success.
I0319 21:44:13.422855 543705 net.go:770] primary dev: ETH0
I0319 21:44:13.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:44:13.422883 543705 net.go:698] Add success.
I0319 21:44:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:44:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:44:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 21:44:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:44:14.456574 543705 disk_worker.go:494] system disk:vda1
I0319 21:44:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:44:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:44:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:44:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:44:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:44:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:44:21.121677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:44:21.124087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:44:21.124093 543705 disk_info.go:196] parse disk info done, disk is : [0xc000231280 0xc0002312c0]
E0319 21:44:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:23.409804 543705 memory.go:184] no items to output this cycle
I0319 21:44:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 21:44:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:33.409795 543705 memory.go:184] no items to output this cycle
I0319 21:44:33.409916 543705 cpu.go:275] no items to output this cycle
E0319 21:44:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:43.409802 543705 memory.go:191] Add success.
I0319 21:44:43.409809 543705 cpu.go:282] Add success.
I0319 21:44:43.419897 543705 net.go:648] Add success.
I0319 21:44:43.422586 543705 net.go:770] primary dev: ETH0
I0319 21:44:43.422602 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:44:43.422617 543705 net.go:698] Add success.
I0319 21:44:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:44:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:44:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:44:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:44:53.409789 543705 memory.go:184] no items to output this cycle
I0319 21:44:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 21:45:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:03.409776 543705 memory.go:184] no items to output this cycle
I0319 21:45:03.409882 543705 cpu.go:275] no items to output this cycle
E0319 21:45:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:13.409826 543705 memory.go:191] Add success.
I0319 21:45:13.409832 543705 cpu.go:282] Add success.
W0319 21:45:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:45:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:45:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:45:13.420156 543705 net.go:648] Add success.
I0319 21:45:13.423102 543705 net.go:770] primary dev: ETH0
I0319 21:45:13.423115 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:45:13.423127 543705 net.go:698] Add success.
I0319 21:45:13.468939 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c07c43c8-bfc1-429b-b071-0dd4dafac9b0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:45:13.468972 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:45:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:45:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:45:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 21:45:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:45:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 21:45:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:45:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:45:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:45:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:45:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:45:21.125673 543705 disk_info.go:125] begin check local disk info of client
I0319 21:45:21.128125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:45:21.128131 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024eb00 0xc00024eb40]
E0319 21:45:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:45:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:45:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:33.409929 543705 memory.go:184] no items to output this cycle
I0319 21:45:33.409952 543705 cpu.go:275] no items to output this cycle
I0319 21:45:37.909736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:45:37.909742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:45:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:43.410659 543705 memory.go:191] Add success.
I0319 21:45:43.409815 543705 cpu.go:282] Add success.
I0319 21:45:43.420339 543705 net.go:648] Add success.
I0319 21:45:43.423094 543705 net.go:770] primary dev: ETH0
I0319 21:45:43.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:45:43.423118 543705 net.go:698] Add success.
I0319 21:45:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:45:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:45:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:45:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:45:53.409781 543705 memory.go:184] no items to output this cycle
I0319 21:45:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 21:46:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:03.409791 543705 cpu.go:275] no items to output this cycle
I0319 21:46:03.409801 543705 memory.go:184] no items to output this cycle
E0319 21:46:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:13.409789 543705 memory.go:191] Add success.
I0319 21:46:13.409793 543705 cpu.go:282] Add success.
W0319 21:46:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:46:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:46:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:46:13.420067 543705 net.go:648] Add success.
I0319 21:46:13.422858 543705 net.go:770] primary dev: ETH0
I0319 21:46:13.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:46:13.422884 543705 net.go:698] Add success.
I0319 21:46:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:46:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:46:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 21:46:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:46:14.456559 543705 disk_worker.go:494] system disk:vda1
I0319 21:46:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:46:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:46:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:46:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:46:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:46:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:46:21.129675 543705 disk_info.go:125] begin check local disk info of client
I0319 21:46:21.132157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:46:21.132163 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bf040 0xc0003bf080]
E0319 21:46:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:23.409770 543705 memory.go:184] no items to output this cycle
I0319 21:46:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 21:46:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:33.409820 543705 memory.go:184] no items to output this cycle
I0319 21:46:33.409831 543705 cpu.go:275] no items to output this cycle
E0319 21:46:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:43.409794 543705 memory.go:191] Add success.
I0319 21:46:43.409812 543705 cpu.go:282] Add success.
I0319 21:46:43.419983 543705 net.go:648] Add success.
I0319 21:46:43.422628 543705 net.go:770] primary dev: ETH0
I0319 21:46:43.422640 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:46:43.422652 543705 net.go:698] Add success.
I0319 21:46:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:46:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:46:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:46:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:46:53.409782 543705 memory.go:184] no items to output this cycle
I0319 21:46:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 21:47:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:03.409805 543705 memory.go:184] no items to output this cycle
I0319 21:47:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 21:47:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:13.409811 543705 memory.go:191] Add success.
I0319 21:47:13.409820 543705 cpu.go:282] Add success.
W0319 21:47:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:47:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:47:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:47:13.420299 543705 net.go:648] Add success.
I0319 21:47:13.423254 543705 net.go:770] primary dev: ETH0
I0319 21:47:13.423269 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:47:13.423280 543705 net.go:698] Add success.
I0319 21:47:13.452792 543705 event_worker.go:152] Polling the log file for events...
W0319 21:47:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:47:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 21:47:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:47:14.456783 543705 disk_worker.go:494] system disk:vda1
I0319 21:47:14.456826 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:47:14.456994 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:47:14.457003 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:47:14.457009 543705 custom_config.go:64] query custom config with name: gpu
E0319 21:47:15.456842 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:47:15.456850 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:47:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:47:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:47:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:47:16.458037 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:47:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:47:21.133672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:47:21.136067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:47:21.136073 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390c80 0xc000390cc0]
E0319 21:47:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:23.409786 543705 memory.go:184] no items to output this cycle
I0319 21:47:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 21:47:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:33.409792 543705 memory.go:184] no items to output this cycle
I0319 21:47:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 21:47:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:43.409788 543705 memory.go:191] Add success.
I0319 21:47:43.409818 543705 cpu.go:282] Add success.
I0319 21:47:43.419699 543705 net.go:770] primary dev: ETH0
I0319 21:47:43.419715 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:47:43.419730 543705 net.go:698] Add success.
I0319 21:47:43.420090 543705 net.go:648] Add success.
I0319 21:47:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:47:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:47:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:47:53.409770 543705 memory.go:184] no items to output this cycle
I0319 21:47:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 21:48:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:03.409812 543705 memory.go:184] no items to output this cycle
I0319 21:48:03.409824 543705 cpu.go:275] no items to output this cycle
E0319 21:48:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:13.409793 543705 memory.go:191] Add success.
I0319 21:48:13.409797 543705 cpu.go:282] Add success.
W0319 21:48:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:48:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:48:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:48:13.420035 543705 net.go:648] Add success.
I0319 21:48:13.422713 543705 net.go:770] primary dev: ETH0
I0319 21:48:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:48:13.422739 543705 net.go:698] Add success.
I0319 21:48:13.854475 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7dc12a95-8b72-4c88-8c68-620246dffaab","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:48:13.854513 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:48:14.453981 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:48:14.454238 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:48:14.454248 543705 disk_worker.go:708] disk space is not compliant
W0319 21:48:14.454250 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:48:14.455794 543705 disk_worker.go:494] system disk:vda1
I0319 21:48:14.455825 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:48:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:48:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:48:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:48:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:48:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:48:21.137674 543705 disk_info.go:125] begin check local disk info of client
I0319 21:48:21.140124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:48:21.140130 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e15c0 0xc0003e1600]
E0319 21:48:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:23.409791 543705 memory.go:184] no items to output this cycle
I0319 21:48:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 21:48:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:33.409796 543705 memory.go:184] no items to output this cycle
I0319 21:48:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 21:48:37.912255 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:48:37.912262 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:48:43.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:43.409987 543705 cpu.go:282] Add success.
I0319 21:48:43.410701 543705 memory.go:191] Add success.
I0319 21:48:43.419709 543705 net.go:648] Add success.
I0319 21:48:43.422459 543705 net.go:770] primary dev: ETH0
I0319 21:48:43.422472 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:48:43.422483 543705 net.go:698] Add success.
I0319 21:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:48:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:48:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:48:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:48:53.409777 543705 memory.go:184] no items to output this cycle
I0319 21:48:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 21:49:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:03.409788 543705 memory.go:184] no items to output this cycle
I0319 21:49:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:49:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:13.409790 543705 memory.go:191] Add success.
I0319 21:49:13.409790 543705 cpu.go:282] Add success.
W0319 21:49:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:49:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:49:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:49:13.420208 543705 net.go:648] Add success.
I0319 21:49:13.423284 543705 net.go:770] primary dev: ETH0
I0319 21:49:13.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:49:13.423309 543705 net.go:698] Add success.
I0319 21:49:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:49:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:49:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 21:49:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:49:14.456565 543705 disk_worker.go:494] system disk:vda1
I0319 21:49:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:49:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:49:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:49:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:49:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:49:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:49:21.141690 543705 disk_info.go:125] begin check local disk info of client
I0319 21:49:21.144090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:49:21.144096 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252580 0xc0002525c0]
E0319 21:49:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:23.409787 543705 memory.go:184] no items to output this cycle
I0319 21:49:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 21:49:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:33.409787 543705 memory.go:184] no items to output this cycle
I0319 21:49:33.409817 543705 cpu.go:275] no items to output this cycle
E0319 21:49:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:43.409795 543705 memory.go:191] Add success.
I0319 21:49:43.409798 543705 cpu.go:282] Add success.
I0319 21:49:43.420325 543705 net.go:648] Add success.
I0319 21:49:43.423291 543705 net.go:770] primary dev: ETH0
I0319 21:49:43.423309 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:49:43.423323 543705 net.go:698] Add success.
I0319 21:49:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:49:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:49:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:49:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:49:53.409763 543705 memory.go:184] no items to output this cycle
I0319 21:49:53.409796 543705 cpu.go:275] no items to output this cycle
E0319 21:50:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:03.409786 543705 memory.go:184] no items to output this cycle
I0319 21:50:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 21:50:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:13.409793 543705 memory.go:191] Add success.
I0319 21:50:13.409794 543705 cpu.go:282] Add success.
W0319 21:50:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:50:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:50:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:50:13.420146 543705 net.go:648] Add success.
I0319 21:50:13.422918 543705 net.go:770] primary dev: ETH0
I0319 21:50:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:50:13.422947 543705 net.go:698] Add success.
I0319 21:50:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:50:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:50:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 21:50:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:50:14.456499 543705 disk_worker.go:494] system disk:vda1
I0319 21:50:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:50:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:50:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:50:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:50:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:50:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:50:21.145672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:50:21.148053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:50:21.148059 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e1b00 0xc0003e1bc0]
E0319 21:50:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:23.409786 543705 memory.go:184] no items to output this cycle
I0319 21:50:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 21:50:33.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:33.409827 543705 memory.go:184] no items to output this cycle
I0319 21:50:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:50:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:43.409778 543705 memory.go:191] Add success.
I0319 21:50:43.409820 543705 cpu.go:282] Add success.
I0319 21:50:43.420094 543705 net.go:648] Add success.
I0319 21:50:43.422878 543705 net.go:770] primary dev: ETH0
I0319 21:50:43.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:50:43.422903 543705 net.go:698] Add success.
I0319 21:50:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:50:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:50:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:50:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:50:53.409770 543705 memory.go:184] no items to output this cycle
I0319 21:50:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 21:51:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:03.409804 543705 memory.go:184] no items to output this cycle
I0319 21:51:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 21:51:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:13.409779 543705 memory.go:191] Add success.
W0319 21:51:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:51:13.409811 543705 cpu.go:282] Add success.
W0319 21:51:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:51:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:51:13.420060 543705 net.go:648] Add success.
I0319 21:51:13.422893 543705 net.go:770] primary dev: ETH0
I0319 21:51:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:51:13.422925 543705 net.go:698] Add success.
I0319 21:51:13.580373 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0f8a65e0-32b6-4f03-8525-96aaf2e36808","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:51:13.580408 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:51:14.453984 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:51:14.454233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:51:14.454243 543705 disk_worker.go:708] disk space is not compliant
W0319 21:51:14.454245 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:51:14.455759 543705 disk_worker.go:494] system disk:vda1
I0319 21:51:14.455796 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:51:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:51:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:51:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:51:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:51:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:51:21.149674 543705 disk_info.go:125] begin check local disk info of client
I0319 21:51:21.152139 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:51:21.152145 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048b240 0xc00048b280]
E0319 21:51:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:23.409760 543705 memory.go:184] no items to output this cycle
I0319 21:51:23.409798 543705 cpu.go:275] no items to output this cycle
I0319 21:51:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 21:51:33.409822 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:33.409845 543705 memory.go:184] no items to output this cycle
I0319 21:51:37.913748 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:51:37.913756 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:51:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:43.410682 543705 memory.go:191] Add success.
I0319 21:51:43.409836 543705 cpu.go:282] Add success.
I0319 21:51:43.420393 543705 net.go:648] Add success.
I0319 21:51:43.423684 543705 net.go:770] primary dev: ETH0
I0319 21:51:43.423699 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:51:43.423714 543705 net.go:698] Add success.
I0319 21:51:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:51:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:51:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:51:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:51:53.409773 543705 memory.go:184] no items to output this cycle
I0319 21:51:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 21:52:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:03.409779 543705 memory.go:184] no items to output this cycle
I0319 21:52:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:52:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:13.409782 543705 memory.go:191] Add success.
W0319 21:52:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:52:13.409809 543705 cpu.go:282] Add success.
W0319 21:52:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:52:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:52:13.420064 543705 net.go:648] Add success.
I0319 21:52:13.422740 543705 net.go:770] primary dev: ETH0
I0319 21:52:13.422753 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:52:13.422766 543705 net.go:698] Add success.
W0319 21:52:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:52:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 21:52:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:52:14.456799 543705 disk_worker.go:494] system disk:vda1
I0319 21:52:14.456837 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:52:14.457106 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:52:14.457114 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:52:14.457118 543705 custom_config.go:64] query custom config with name: gpu
E0319 21:52:15.456804 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:52:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:52:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:52:16.457977 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:52:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:52:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:52:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:52:21.153678 543705 disk_info.go:125] begin check local disk info of client
I0319 21:52:21.156104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:52:21.156110 543705 disk_info.go:196] parse disk info done, disk is : [0xc000252ac0 0xc000252b00]
E0319 21:52:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:23.409796 543705 memory.go:184] no items to output this cycle
I0319 21:52:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 21:52:33.409895 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:33.409917 543705 memory.go:184] no items to output this cycle
I0319 21:52:33.410098 543705 cpu.go:275] no items to output this cycle
E0319 21:52:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:43.409806 543705 cpu.go:282] Add success.
I0319 21:52:43.409811 543705 memory.go:191] Add success.
I0319 21:52:43.419881 543705 net.go:648] Add success.
I0319 21:52:43.423781 543705 net.go:770] primary dev: ETH0
I0319 21:52:43.423800 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:52:43.423814 543705 net.go:698] Add success.
I0319 21:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:52:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:52:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:52:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:52:53.409765 543705 memory.go:184] no items to output this cycle
I0319 21:52:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:53:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:03.409813 543705 memory.go:184] no items to output this cycle
I0319 21:53:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 21:53:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:13.409816 543705 memory.go:191] Add success.
I0319 21:53:13.409822 543705 cpu.go:282] Add success.
W0319 21:53:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:53:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:53:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:53:13.420062 543705 net.go:648] Add success.
I0319 21:53:13.422893 543705 net.go:770] primary dev: ETH0
I0319 21:53:13.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:53:13.422922 543705 net.go:698] Add success.
I0319 21:53:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:53:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:53:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 21:53:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:53:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 21:53:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:53:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:53:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:53:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:53:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:53:21.157677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:53:21.160117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:53:21.160123 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ab880 0xc0003ab8c0]
E0319 21:53:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:23.409794 543705 memory.go:184] no items to output this cycle
I0319 21:53:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 21:53:33.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:33.409824 543705 memory.go:184] no items to output this cycle
I0319 21:53:33.409830 543705 cpu.go:275] no items to output this cycle
E0319 21:53:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:43.409785 543705 memory.go:191] Add success.
I0319 21:53:43.409804 543705 cpu.go:282] Add success.
I0319 21:53:43.419901 543705 net.go:648] Add success.
I0319 21:53:43.422676 543705 net.go:770] primary dev: ETH0
I0319 21:53:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:53:43.422703 543705 net.go:698] Add success.
I0319 21:53:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:53:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:53:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:53:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:53:53.409803 543705 memory.go:184] no items to output this cycle
I0319 21:53:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 21:54:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:03.409787 543705 memory.go:184] no items to output this cycle
I0319 21:54:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 21:54:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:13.409821 543705 memory.go:191] Add success.
I0319 21:54:13.409826 543705 cpu.go:282] Add success.
W0319 21:54:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:54:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:54:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:54:13.420418 543705 net.go:648] Add success.
I0319 21:54:13.423386 543705 net.go:770] primary dev: ETH0
I0319 21:54:13.423398 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:54:13.423411 543705 net.go:698] Add success.
I0319 21:54:13.468893 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"28746f54-5691-41cf-830b-ebf8519766c9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:54:13.468928 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 21:54:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:54:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:54:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 21:54:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:54:14.456605 543705 disk_worker.go:494] system disk:vda1
I0319 21:54:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:54:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:54:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:54:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:54:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:54:16.472110 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:54:21.161680 543705 disk_info.go:125] begin check local disk info of client
I0319 21:54:21.164136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:54:21.164143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e2080 0xc0001e20c0]
E0319 21:54:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:23.409804 543705 memory.go:184] no items to output this cycle
I0319 21:54:23.409816 543705 cpu.go:275] no items to output this cycle
E0319 21:54:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:33.409795 543705 memory.go:184] no items to output this cycle
I0319 21:54:33.409857 543705 cpu.go:275] no items to output this cycle
I0319 21:54:37.916271 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:54:37.916279 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:54:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:43.410941 543705 memory.go:191] Add success.
I0319 21:54:43.409837 543705 cpu.go:282] Add success.
I0319 21:54:43.420658 543705 net.go:648] Add success.
I0319 21:54:43.423273 543705 net.go:770] primary dev: ETH0
I0319 21:54:43.423287 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:54:43.423302 543705 net.go:698] Add success.
I0319 21:54:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:54:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:54:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:54:53.410241 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:54:53.410266 543705 memory.go:184] no items to output this cycle
I0319 21:54:53.410275 543705 cpu.go:275] no items to output this cycle
E0319 21:55:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:03.409786 543705 memory.go:184] no items to output this cycle
I0319 21:55:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 21:55:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:13.409793 543705 memory.go:191] Add success.
I0319 21:55:13.409815 543705 cpu.go:282] Add success.
W0319 21:55:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:55:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:55:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:55:13.420256 543705 net.go:648] Add success.
I0319 21:55:13.423052 543705 net.go:770] primary dev: ETH0
I0319 21:55:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:55:13.423078 543705 net.go:698] Add success.
I0319 21:55:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:55:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:55:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 21:55:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:55:14.456581 543705 disk_worker.go:494] system disk:vda1
I0319 21:55:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:55:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:55:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:55:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:55:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:55:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:55:21.165676 543705 disk_info.go:125] begin check local disk info of client
I0319 21:55:21.168147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:55:21.168153 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048e440 0xc00048e480]
E0319 21:55:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:23.409808 543705 memory.go:184] no items to output this cycle
I0319 21:55:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 21:55:33.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:33.409833 543705 memory.go:184] no items to output this cycle
I0319 21:55:33.409873 543705 cpu.go:275] no items to output this cycle
E0319 21:55:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:43.409788 543705 memory.go:191] Add success.
I0319 21:55:43.409832 543705 cpu.go:282] Add success.
I0319 21:55:43.419721 543705 net.go:770] primary dev: ETH0
I0319 21:55:43.419737 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:55:43.419752 543705 net.go:698] Add success.
I0319 21:55:43.420122 543705 net.go:648] Add success.
I0319 21:55:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:55:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:55:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:55:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:55:53.409808 543705 memory.go:184] no items to output this cycle
I0319 21:55:53.409824 543705 cpu.go:275] no items to output this cycle
E0319 21:56:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:03.409787 543705 memory.go:184] no items to output this cycle
I0319 21:56:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 21:56:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:13.409784 543705 memory.go:191] Add success.
W0319 21:56:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 21:56:13.409813 543705 cpu.go:282] Add success.
W0319 21:56:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:56:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:56:13.420123 543705 net.go:648] Add success.
I0319 21:56:13.422905 543705 net.go:770] primary dev: ETH0
I0319 21:56:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:56:13.422930 543705 net.go:698] Add success.
I0319 21:56:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:56:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:56:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 21:56:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:56:14.456574 543705 disk_worker.go:494] system disk:vda1
I0319 21:56:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:56:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:56:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:56:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:56:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:56:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:56:21.169672 543705 disk_info.go:125] begin check local disk info of client
I0319 21:56:21.172255 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:56:21.172260 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344100 0xc000344140]
E0319 21:56:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:23.409776 543705 cpu.go:275] no items to output this cycle
I0319 21:56:23.409785 543705 memory.go:184] no items to output this cycle
E0319 21:56:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:33.409796 543705 memory.go:184] no items to output this cycle
I0319 21:56:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 21:56:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:43.409800 543705 memory.go:191] Add success.
I0319 21:56:43.409819 543705 cpu.go:282] Add success.
I0319 21:56:43.419897 543705 net.go:648] Add success.
I0319 21:56:43.422606 543705 net.go:770] primary dev: ETH0
I0319 21:56:43.422618 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:56:43.422630 543705 net.go:698] Add success.
I0319 21:56:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:56:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:56:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:56:53.410388 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:56:53.410411 543705 memory.go:184] no items to output this cycle
I0319 21:56:53.410416 543705 cpu.go:275] no items to output this cycle
E0319 21:57:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:03.409809 543705 memory.go:184] no items to output this cycle
I0319 21:57:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 21:57:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:13.409784 543705 memory.go:191] Add success.
I0319 21:57:13.409804 543705 cpu.go:282] Add success.
W0319 21:57:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:57:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:57:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:57:13.420101 543705 net.go:648] Add success.
I0319 21:57:13.429092 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 21:57:13.429167 543705 net.go:770] primary dev: ETH0
I0319 21:57:13.429180 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:57:13.429191 543705 net.go:698] Add success.
I0319 21:57:13.453726 543705 event_worker.go:152] Polling the log file for events...
I0319 21:57:13.469732 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e36cc32-5090-4630-a6fc-2ddd725a7f96","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 21:57:13.469780 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 21:57:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:57:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 21:57:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0319 21:57:14.456150 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 21:57:14.456160 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 21:57:14.456165 543705 custom_config.go:64] query custom config with name: gpu
I0319 21:57:14.456447 543705 disk_worker.go:494] system disk:vda1
I0319 21:57:14.456477 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 21:57:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 21:57:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:57:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 21:57:16.457942 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 21:57:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:57:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:57:16.472342 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:57:21.173682 543705 disk_info.go:125] begin check local disk info of client
I0319 21:57:21.176085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:57:21.176092 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001cc000 0xc0001cc040]
E0319 21:57:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:23.409766 543705 memory.go:184] no items to output this cycle
I0319 21:57:23.409789 543705 cpu.go:275] no items to output this cycle
E0319 21:57:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:33.409799 543705 memory.go:184] no items to output this cycle
I0319 21:57:33.409812 543705 cpu.go:275] no items to output this cycle
I0319 21:57:37.917747 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 21:57:37.917755 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 21:57:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:43.410767 543705 memory.go:191] Add success.
I0319 21:57:43.409808 543705 cpu.go:282] Add success.
I0319 21:57:43.420688 543705 net.go:648] Add success.
I0319 21:57:43.423437 543705 net.go:770] primary dev: ETH0
I0319 21:57:43.423452 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:57:43.423467 543705 net.go:698] Add success.
I0319 21:57:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:57:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:57:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:57:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:57:53.409798 543705 memory.go:184] no items to output this cycle
I0319 21:57:53.409810 543705 cpu.go:275] no items to output this cycle
E0319 21:58:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:03.409785 543705 cpu.go:275] no items to output this cycle
I0319 21:58:03.409795 543705 memory.go:184] no items to output this cycle
E0319 21:58:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:13.409811 543705 memory.go:191] Add success.
I0319 21:58:13.409815 543705 cpu.go:282] Add success.
W0319 21:58:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:58:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:58:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:58:13.420123 543705 net.go:648] Add success.
I0319 21:58:13.422961 543705 net.go:770] primary dev: ETH0
I0319 21:58:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:58:13.422989 543705 net.go:698] Add success.
I0319 21:58:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:58:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:58:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0319 21:58:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:58:14.456592 543705 disk_worker.go:494] system disk:vda1
I0319 21:58:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:58:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:58:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:58:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:58:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:58:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:58:21.177677 543705 disk_info.go:125] begin check local disk info of client
I0319 21:58:21.180123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:58:21.180129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c22c0 0xc0004c2300]
E0319 21:58:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:23.409764 543705 memory.go:184] no items to output this cycle
I0319 21:58:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 21:58:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:33.409810 543705 memory.go:184] no items to output this cycle
I0319 21:58:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 21:58:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:43.409799 543705 memory.go:191] Add success.
I0319 21:58:43.409825 543705 cpu.go:282] Add success.
I0319 21:58:43.419870 543705 net.go:648] Add success.
I0319 21:58:43.422726 543705 net.go:770] primary dev: ETH0
I0319 21:58:43.422739 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:58:43.422751 543705 net.go:698] Add success.
I0319 21:58:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:58:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:58:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:58:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:58:53.409789 543705 cpu.go:275] no items to output this cycle
I0319 21:58:53.409796 543705 memory.go:184] no items to output this cycle
E0319 21:59:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:03.409789 543705 memory.go:184] no items to output this cycle
I0319 21:59:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 21:59:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:13.409803 543705 cpu.go:282] Add success.
I0319 21:59:13.409806 543705 memory.go:191] Add success.
W0319 21:59:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 21:59:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 21:59:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 21:59:13.420063 543705 net.go:648] Add success.
I0319 21:59:13.423099 543705 net.go:770] primary dev: ETH0
I0319 21:59:13.423113 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:59:13.423125 543705 net.go:698] Add success.
I0319 21:59:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 21:59:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 21:59:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 21:59:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0319 21:59:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 21:59:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 21:59:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 21:59:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:59:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:59:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 21:59:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0319 21:59:21.181675 543705 disk_info.go:125] begin check local disk info of client
I0319 21:59:21.184147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 21:59:21.184153 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2080 0xc0002b20c0]
E0319 21:59:23.410670 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:23.410684 543705 memory.go:184] no items to output this cycle
I0319 21:59:23.410688 543705 cpu.go:275] no items to output this cycle
E0319 21:59:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:33.409785 543705 memory.go:184] no items to output this cycle
I0319 21:59:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 21:59:43.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:43.409834 543705 memory.go:191] Add success.
I0319 21:59:43.409835 543705 cpu.go:282] Add success.
I0319 21:59:43.420377 543705 net.go:648] Add success.
I0319 21:59:43.423197 543705 net.go:770] primary dev: ETH0
I0319 21:59:43.423210 543705 net.go:802] Send network stats successfully!,count is 6
I0319 21:59:43.423223 543705 net.go:698] Add success.
I0319 21:59:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 21:59:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 21:59:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 21:59:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 21:59:53.409809 543705 memory.go:184] no items to output this cycle
I0319 21:59:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 22:00:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:03.409802 543705 memory.go:184] no items to output this cycle
I0319 22:00:03.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:00:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:13.409793 543705 memory.go:191] Add success.
I0319 22:00:13.409792 543705 cpu.go:282] Add success.
W0319 22:00:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:00:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:00:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:00:13.420123 543705 net.go:648] Add success.
I0319 22:00:13.422900 543705 net.go:770] primary dev: ETH0
I0319 22:00:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:00:13.422925 543705 net.go:698] Add success.
I0319 22:00:13.468966 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f70ce4cf-7bbe-4ed4-af88-2cd2848c57e7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:00:13.468999 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:00:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:00:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:00:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 22:00:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:00:14.456479 543705 disk_worker.go:494] system disk:vda1
I0319 22:00:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:00:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:00:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:00:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:00:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:00:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:00:21.185666 543705 disk_info.go:125] begin check local disk info of client
I0319 22:00:21.188093 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:00:21.188100 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e78c0 0xc0003e7900]
E0319 22:00:23.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:23.409870 543705 memory.go:184] no items to output this cycle
I0319 22:00:23.409940 543705 cpu.go:275] no items to output this cycle
E0319 22:00:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:33.409780 543705 memory.go:184] no items to output this cycle
I0319 22:00:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 22:00:37.917911 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:00:37.917919 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:00:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:43.410638 543705 memory.go:191] Add success.
I0319 22:00:43.409811 543705 cpu.go:282] Add success.
I0319 22:00:43.420336 543705 net.go:648] Add success.
I0319 22:00:43.422903 543705 net.go:770] primary dev: ETH0
I0319 22:00:43.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:00:43.422929 543705 net.go:698] Add success.
I0319 22:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:00:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:00:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:00:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:00:53.409791 543705 memory.go:184] no items to output this cycle
I0319 22:00:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:01:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:03.409789 543705 memory.go:184] no items to output this cycle
I0319 22:01:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:01:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:13.409783 543705 memory.go:191] Add success.
I0319 22:01:13.409806 543705 cpu.go:282] Add success.
W0319 22:01:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:01:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:01:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:01:13.420203 543705 net.go:648] Add success.
I0319 22:01:13.423054 543705 net.go:770] primary dev: ETH0
I0319 22:01:13.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:01:13.423080 543705 net.go:698] Add success.
I0319 22:01:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:01:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:01:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 22:01:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:01:14.456514 543705 disk_worker.go:494] system disk:vda1
I0319 22:01:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:01:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:01:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:01:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:01:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:01:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:01:21.189671 543705 disk_info.go:125] begin check local disk info of client
I0319 22:01:21.192105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:01:21.192111 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004641c0 0xc000464200]
E0319 22:01:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:01:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:01:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:33.409785 543705 memory.go:184] no items to output this cycle
I0319 22:01:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:01:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:43.409805 543705 memory.go:191] Add success.
I0319 22:01:43.409806 543705 cpu.go:282] Add success.
I0319 22:01:43.420013 543705 net.go:648] Add success.
I0319 22:01:43.422611 543705 net.go:770] primary dev: ETH0
I0319 22:01:43.422624 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:01:43.422637 543705 net.go:698] Add success.
I0319 22:01:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:01:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:01:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:01:53.410388 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:01:53.410409 543705 memory.go:184] no items to output this cycle
I0319 22:01:53.410423 543705 cpu.go:275] no items to output this cycle
E0319 22:02:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:03.409801 543705 memory.go:184] no items to output this cycle
I0319 22:02:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 22:02:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:13.409783 543705 memory.go:191] Add success.
W0319 22:02:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:02:13.409810 543705 cpu.go:282] Add success.
W0319 22:02:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:02:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:02:13.420071 543705 net.go:648] Add success.
I0319 22:02:13.422940 543705 net.go:770] primary dev: ETH0
I0319 22:02:13.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:02:13.422965 543705 net.go:698] Add success.
W0319 22:02:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:02:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 22:02:14.455204 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:02:14.456816 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:02:14.456825 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:02:14.456830 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:02:14.456874 543705 disk_worker.go:494] system disk:vda1
I0319 22:02:14.456915 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:02:15.456790 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:02:15.456799 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:02:16.457921 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:02:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:02:16.457975 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:02:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:02:16.472327 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:02:21.193673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:02:21.196061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:02:21.196068 543705 disk_info.go:196] parse disk info done, disk is : [0xc000305440 0xc000305480]
E0319 22:02:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:02:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:02:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:33.409811 543705 memory.go:184] no items to output this cycle
I0319 22:02:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 22:02:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:43.409796 543705 memory.go:191] Add success.
I0319 22:02:43.409823 543705 cpu.go:282] Add success.
I0319 22:02:43.419791 543705 net.go:770] primary dev: ETH0
I0319 22:02:43.419803 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:02:43.419816 543705 net.go:698] Add success.
I0319 22:02:43.420063 543705 net.go:648] Add success.
I0319 22:02:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:02:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:02:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:02:53.410241 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:02:53.410258 543705 memory.go:184] no items to output this cycle
I0319 22:02:53.410284 543705 cpu.go:275] no items to output this cycle
E0319 22:03:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:03.409790 543705 memory.go:184] no items to output this cycle
I0319 22:03:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 22:03:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:13.409772 543705 memory.go:191] Add success.
W0319 22:03:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:03:13.409804 543705 cpu.go:282] Add success.
W0319 22:03:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:03:13.409814 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:03:13.420157 543705 net.go:648] Add success.
I0319 22:03:13.423095 543705 net.go:770] primary dev: ETH0
I0319 22:03:13.423122 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:03:13.423135 543705 net.go:698] Add success.
I0319 22:03:13.547115 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0fea969e-bada-4e2f-830f-085d2ec679c4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:03:13.547148 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:03:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:03:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:03:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0319 22:03:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:03:14.456614 543705 disk_worker.go:494] system disk:vda1
I0319 22:03:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:03:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:03:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:03:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:03:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:03:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:03:21.197676 543705 disk_info.go:125] begin check local disk info of client
I0319 22:03:21.200191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:03:21.200197 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e1240 0xc0000e1280]
E0319 22:03:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:23.409777 543705 memory.go:184] no items to output this cycle
I0319 22:03:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 22:03:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:33.409810 543705 memory.go:184] no items to output this cycle
I0319 22:03:33.409821 543705 cpu.go:275] no items to output this cycle
I0319 22:03:37.918063 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:03:37.918071 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:03:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:43.410580 543705 memory.go:191] Add success.
I0319 22:03:43.409806 543705 cpu.go:282] Add success.
I0319 22:03:43.420341 543705 net.go:648] Add success.
I0319 22:03:43.423258 543705 net.go:770] primary dev: ETH0
I0319 22:03:43.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:03:43.423284 543705 net.go:698] Add success.
I0319 22:03:46.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:03:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:03:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:03:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:03:53.409780 543705 memory.go:184] no items to output this cycle
I0319 22:03:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 22:04:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:03.409791 543705 memory.go:184] no items to output this cycle
I0319 22:04:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 22:04:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:13.409786 543705 memory.go:191] Add success.
I0319 22:04:13.409806 543705 cpu.go:282] Add success.
W0319 22:04:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:04:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:04:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:04:13.420099 543705 net.go:648] Add success.
I0319 22:04:13.422644 543705 net.go:770] primary dev: ETH0
I0319 22:04:13.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:04:13.422669 543705 net.go:698] Add success.
I0319 22:04:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:04:14.455086 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:04:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0319 22:04:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:04:14.456482 543705 disk_worker.go:494] system disk:vda1
I0319 22:04:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:04:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:04:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:04:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:04:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:04:16.472422 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:04:21.201673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:04:21.204137 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:04:21.204144 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003900c0 0xc000390100]
E0319 22:04:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:23.409761 543705 memory.go:184] no items to output this cycle
I0319 22:04:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 22:04:33.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:33.409896 543705 memory.go:184] no items to output this cycle
I0319 22:04:33.409979 543705 cpu.go:275] no items to output this cycle
E0319 22:04:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:43.409788 543705 memory.go:191] Add success.
I0319 22:04:43.409823 543705 cpu.go:282] Add success.
I0319 22:04:43.419894 543705 net.go:648] Add success.
I0319 22:04:43.422461 543705 net.go:770] primary dev: ETH0
I0319 22:04:43.422475 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:04:43.422487 543705 net.go:698] Add success.
I0319 22:04:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:04:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:04:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:04:53.410365 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:04:53.410383 543705 memory.go:184] no items to output this cycle
I0319 22:04:53.410394 543705 cpu.go:275] no items to output this cycle
E0319 22:05:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:03.409789 543705 memory.go:184] no items to output this cycle
I0319 22:05:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 22:05:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:13.409786 543705 memory.go:191] Add success.
I0319 22:05:13.409788 543705 cpu.go:282] Add success.
W0319 22:05:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:05:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:05:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:05:13.420189 543705 net.go:648] Add success.
I0319 22:05:13.423077 543705 net.go:770] primary dev: ETH0
I0319 22:05:13.423090 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:05:13.423103 543705 net.go:698] Add success.
I0319 22:05:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:05:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:05:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 22:05:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:05:14.456496 543705 disk_worker.go:494] system disk:vda1
I0319 22:05:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:05:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:05:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:05:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:05:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:05:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:05:21.205725 543705 disk_info.go:125] begin check local disk info of client
I0319 22:05:21.208214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:05:21.208221 543705 disk_info.go:196] parse disk info done, disk is : [0xc000291840 0xc000291880]
E0319 22:05:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:23.409794 543705 memory.go:184] no items to output this cycle
I0319 22:05:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 22:05:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:33.409782 543705 memory.go:184] no items to output this cycle
I0319 22:05:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 22:05:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:43.409827 543705 memory.go:191] Add success.
I0319 22:05:43.409835 543705 cpu.go:282] Add success.
I0319 22:05:43.419996 543705 net.go:648] Add success.
I0319 22:05:43.422567 543705 net.go:770] primary dev: ETH0
I0319 22:05:43.422581 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:05:43.422593 543705 net.go:698] Add success.
I0319 22:05:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:05:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:05:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:05:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:05:53.409767 543705 memory.go:184] no items to output this cycle
I0319 22:05:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:06:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:03.409804 543705 memory.go:184] no items to output this cycle
I0319 22:06:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 22:06:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:13.409793 543705 memory.go:191] Add success.
I0319 22:06:13.409795 543705 cpu.go:282] Add success.
W0319 22:06:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:06:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:06:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:06:13.420236 543705 net.go:648] Add success.
I0319 22:06:13.423111 543705 net.go:770] primary dev: ETH0
I0319 22:06:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:06:13.423136 543705 net.go:698] Add success.
I0319 22:06:13.475982 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"395c5114-4cb1-4fd6-b5e0-75054afe29c2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:06:13.476015 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:06:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:06:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:06:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 22:06:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:06:14.456532 543705 disk_worker.go:494] system disk:vda1
I0319 22:06:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:06:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:06:16.457570 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:06:16.457635 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:06:16.457685 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:06:16.473026 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:06:21.209675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:06:21.212147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:06:21.212153 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002462c0 0xc000246300]
E0319 22:06:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:23.409765 543705 memory.go:184] no items to output this cycle
I0319 22:06:23.409774 543705 cpu.go:275] no items to output this cycle
E0319 22:06:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:33.409765 543705 memory.go:184] no items to output this cycle
I0319 22:06:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 22:06:37.920303 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:06:37.920311 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:06:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:43.410730 543705 memory.go:191] Add success.
I0319 22:06:43.409814 543705 cpu.go:282] Add success.
I0319 22:06:43.420441 543705 net.go:648] Add success.
I0319 22:06:43.423117 543705 net.go:770] primary dev: ETH0
I0319 22:06:43.423130 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:06:43.423143 543705 net.go:698] Add success.
I0319 22:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:06:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:06:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:06:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:06:53.409769 543705 memory.go:184] no items to output this cycle
I0319 22:06:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 22:07:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:03.409789 543705 memory.go:184] no items to output this cycle
I0319 22:07:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 22:07:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:13.409780 543705 memory.go:191] Add success.
W0319 22:07:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:07:13.409810 543705 cpu.go:282] Add success.
W0319 22:07:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:07:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:07:13.420070 543705 net.go:648] Add success.
I0319 22:07:13.422595 543705 net.go:770] primary dev: ETH0
I0319 22:07:13.422608 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:07:13.422620 543705 net.go:698] Add success.
I0319 22:07:13.453169 543705 event_worker.go:152] Polling the log file for events...
W0319 22:07:14.455458 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:07:14.455472 543705 disk_worker.go:708] disk space is not compliant
W0319 22:07:14.455476 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:07:14.456845 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:07:14.456854 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:07:14.456860 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:07:14.457725 543705 disk_worker.go:494] system disk:vda1
I0319 22:07:14.457764 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:07:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:07:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:07:16.457915 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:07:16.457915 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:07:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:07:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:07:16.472331 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:07:21.213674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:07:21.216047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:07:21.216052 543705 disk_info.go:196] parse disk info done, disk is : [0xc000480300 0xc000480340]
E0319 22:07:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:23.409762 543705 memory.go:184] no items to output this cycle
I0319 22:07:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:07:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:33.409795 543705 memory.go:184] no items to output this cycle
I0319 22:07:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:07:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:43.409794 543705 memory.go:191] Add success.
I0319 22:07:43.409810 543705 cpu.go:282] Add success.
I0319 22:07:43.419897 543705 net.go:648] Add success.
I0319 22:07:43.422843 543705 net.go:770] primary dev: ETH0
I0319 22:07:43.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:07:43.422870 543705 net.go:698] Add success.
I0319 22:07:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:07:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:07:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:07:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:07:53.409801 543705 memory.go:184] no items to output this cycle
I0319 22:07:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:08:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:03.409784 543705 memory.go:184] no items to output this cycle
I0319 22:08:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 22:08:13.409855 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:13.409882 543705 memory.go:191] Add success.
W0319 22:08:13.409914 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:08:13.409927 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:08:13.409930 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:08:13.409958 543705 cpu.go:282] Add success.
I0319 22:08:13.419758 543705 net.go:648] Add success.
I0319 22:08:13.422368 543705 net.go:770] primary dev: ETH0
I0319 22:08:13.422382 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:08:13.422397 543705 net.go:698] Add success.
I0319 22:08:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:08:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:08:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0319 22:08:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:08:14.456469 543705 disk_worker.go:494] system disk:vda1
I0319 22:08:14.456511 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:08:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:08:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:08:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:08:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:08:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:08:21.217675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:08:21.220095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:08:21.220101 543705 disk_info.go:196] parse disk info done, disk is : [0xc00060b900 0xc00060b940]
E0319 22:08:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:23.409804 543705 memory.go:184] no items to output this cycle
I0319 22:08:23.409820 543705 cpu.go:275] no items to output this cycle
E0319 22:08:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:33.409768 543705 memory.go:184] no items to output this cycle
I0319 22:08:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:08:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:43.409799 543705 memory.go:191] Add success.
I0319 22:08:43.409817 543705 cpu.go:282] Add success.
I0319 22:08:43.419877 543705 net.go:648] Add success.
I0319 22:08:43.423031 543705 net.go:770] primary dev: ETH0
I0319 22:08:43.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:08:43.423229 543705 net.go:698] Add success.
I0319 22:08:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:08:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:08:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:08:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:08:53.409786 543705 memory.go:184] no items to output this cycle
I0319 22:08:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:09:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:03.409813 543705 memory.go:184] no items to output this cycle
I0319 22:09:03.409825 543705 cpu.go:275] no items to output this cycle
E0319 22:09:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:13.409804 543705 memory.go:191] Add success.
I0319 22:09:13.409806 543705 cpu.go:282] Add success.
W0319 22:09:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:09:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:09:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:09:13.420239 543705 net.go:648] Add success.
I0319 22:09:13.422983 543705 net.go:770] primary dev: ETH0
I0319 22:09:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:09:13.423008 543705 net.go:698] Add success.
I0319 22:09:13.469679 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5470ea6e-9c1a-4086-bae2-4c1a781175b3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:09:13.469721 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:09:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:09:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:09:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0319 22:09:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:09:14.456487 543705 disk_worker.go:494] system disk:vda1
I0319 22:09:14.456532 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:09:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:09:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:09:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:09:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:09:16.472487 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:09:21.221674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:09:21.224102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:09:21.224108 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5540 0xc0000c5580]
E0319 22:09:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:23.409770 543705 memory.go:184] no items to output this cycle
I0319 22:09:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 22:09:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:33.409809 543705 memory.go:184] no items to output this cycle
I0319 22:09:33.409823 543705 cpu.go:275] no items to output this cycle
I0319 22:09:37.921743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:09:37.921758 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:09:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:43.410661 543705 memory.go:191] Add success.
I0319 22:09:43.409826 543705 cpu.go:282] Add success.
I0319 22:09:43.420373 543705 net.go:648] Add success.
I0319 22:09:43.422943 543705 net.go:770] primary dev: ETH0
I0319 22:09:43.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:09:43.422970 543705 net.go:698] Add success.
I0319 22:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:09:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:09:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:09:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:09:53.409815 543705 memory.go:184] no items to output this cycle
I0319 22:09:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:10:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:03.409779 543705 memory.go:184] no items to output this cycle
I0319 22:10:03.409817 543705 cpu.go:275] no items to output this cycle
E0319 22:10:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:13.409786 543705 memory.go:191] Add success.
I0319 22:10:13.409816 543705 cpu.go:282] Add success.
W0319 22:10:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:10:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:10:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:10:13.420084 543705 net.go:648] Add success.
I0319 22:10:13.422771 543705 net.go:770] primary dev: ETH0
I0319 22:10:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:10:13.422801 543705 net.go:698] Add success.
I0319 22:10:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:10:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:10:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0319 22:10:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:10:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 22:10:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:10:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:10:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:10:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:10:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:10:21.225674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:10:21.228137 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:10:21.228145 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328000 0xc000328040]
E0319 22:10:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:23.409765 543705 memory.go:184] no items to output this cycle
I0319 22:10:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:10:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:33.409800 543705 memory.go:184] no items to output this cycle
I0319 22:10:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 22:10:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:43.409797 543705 memory.go:191] Add success.
I0319 22:10:43.409798 543705 cpu.go:282] Add success.
I0319 22:10:43.419945 543705 net.go:648] Add success.
I0319 22:10:43.422572 543705 net.go:770] primary dev: ETH0
I0319 22:10:43.422587 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:10:43.422602 543705 net.go:698] Add success.
I0319 22:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:10:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:10:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:10:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:10:53.409778 543705 memory.go:184] no items to output this cycle
I0319 22:10:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 22:11:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:03.409775 543705 memory.go:184] no items to output this cycle
I0319 22:11:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 22:11:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:13.409798 543705 memory.go:191] Add success.
I0319 22:11:13.409798 543705 cpu.go:282] Add success.
W0319 22:11:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:11:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:11:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:11:13.420114 543705 net.go:648] Add success.
I0319 22:11:13.422742 543705 net.go:770] primary dev: ETH0
I0319 22:11:13.422757 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:11:13.423003 543705 net.go:698] Add success.
I0319 22:11:14.454950 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:11:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:11:14.455151 543705 disk_worker.go:708] disk space is not compliant
W0319 22:11:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:11:14.456450 543705 disk_worker.go:494] system disk:vda1
I0319 22:11:14.456491 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:11:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:11:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:11:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:11:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:11:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:11:21.229677 543705 disk_info.go:125] begin check local disk info of client
I0319 22:11:21.232062 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:11:21.232068 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d80 0xc0000c4dc0]
E0319 22:11:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:23.409787 543705 memory.go:184] no items to output this cycle
I0319 22:11:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:11:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:33.409766 543705 memory.go:184] no items to output this cycle
I0319 22:11:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:11:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:43.409792 543705 memory.go:191] Add success.
I0319 22:11:43.409808 543705 cpu.go:282] Add success.
I0319 22:11:43.420074 543705 net.go:648] Add success.
I0319 22:11:43.422899 543705 net.go:770] primary dev: ETH0
I0319 22:11:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:11:43.422930 543705 net.go:698] Add success.
I0319 22:11:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:11:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:11:53.410210 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:11:53.410228 543705 memory.go:184] no items to output this cycle
I0319 22:11:53.410249 543705 cpu.go:275] no items to output this cycle
E0319 22:12:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:03.409777 543705 memory.go:184] no items to output this cycle
I0319 22:12:03.409868 543705 cpu.go:275] no items to output this cycle
E0319 22:12:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:13.409775 543705 memory.go:191] Add success.
W0319 22:12:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:12:13.409809 543705 cpu.go:282] Add success.
W0319 22:12:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:12:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:12:13.420121 543705 net.go:648] Add success.
I0319 22:12:13.423181 543705 net.go:770] primary dev: ETH0
I0319 22:12:13.423194 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:12:13.423206 543705 net.go:698] Add success.
I0319 22:12:13.469096 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f7de3e23-c29d-439f-bb2d-815f1d7d589e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:12:13.469128 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 22:12:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:12:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 22:12:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:12:14.455883 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:12:14.455892 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:12:14.455897 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:12:14.456650 543705 disk_worker.go:494] system disk:vda1
I0319 22:12:14.456695 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:12:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:12:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:12:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:12:16.457965 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:12:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:12:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:12:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:12:21.233674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:12:21.236157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:12:21.236164 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002af8c0 0xc0002af900]
E0319 22:12:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:23.409757 543705 memory.go:184] no items to output this cycle
I0319 22:12:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:12:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:33.409766 543705 memory.go:184] no items to output this cycle
I0319 22:12:33.409811 543705 cpu.go:275] no items to output this cycle
I0319 22:12:37.921898 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:12:37.921913 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:12:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:43.410787 543705 memory.go:191] Add success.
I0319 22:12:43.409810 543705 cpu.go:282] Add success.
I0319 22:12:43.420487 543705 net.go:648] Add success.
I0319 22:12:43.423250 543705 net.go:770] primary dev: ETH0
I0319 22:12:43.423263 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:12:43.423276 543705 net.go:698] Add success.
I0319 22:12:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:12:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:12:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:12:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:12:53.409774 543705 memory.go:184] no items to output this cycle
I0319 22:12:53.409777 543705 cpu.go:275] no items to output this cycle
E0319 22:13:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:03.409774 543705 memory.go:184] no items to output this cycle
I0319 22:13:03.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:13:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:13.409774 543705 memory.go:191] Add success.
W0319 22:13:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:13:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:13:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:13:13.409834 543705 cpu.go:282] Add success.
I0319 22:13:13.420057 543705 net.go:648] Add success.
I0319 22:13:13.422980 543705 net.go:770] primary dev: ETH0
I0319 22:13:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:13:13.423222 543705 net.go:698] Add success.
I0319 22:13:14.454944 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:13:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:13:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0319 22:13:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:13:14.456603 543705 disk_worker.go:494] system disk:vda1
I0319 22:13:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:13:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:13:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:13:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:13:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:13:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:13:21.237680 543705 disk_info.go:125] begin check local disk info of client
I0319 22:13:21.240231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:13:21.240238 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002aeb40 0xc0002aeb80]
E0319 22:13:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:23.409762 543705 memory.go:184] no items to output this cycle
I0319 22:13:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:13:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:33.409771 543705 memory.go:184] no items to output this cycle
I0319 22:13:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:13:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:43.409798 543705 memory.go:191] Add success.
I0319 22:13:43.409799 543705 cpu.go:282] Add success.
I0319 22:13:43.419969 543705 net.go:648] Add success.
I0319 22:13:43.422806 543705 net.go:770] primary dev: ETH0
I0319 22:13:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:13:43.422832 543705 net.go:698] Add success.
I0319 22:13:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:13:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:13:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:13:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:13:53.409795 543705 memory.go:184] no items to output this cycle
I0319 22:13:53.409804 543705 cpu.go:275] no items to output this cycle
I0319 22:14:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:14:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:03.409818 543705 memory.go:184] no items to output this cycle
E0319 22:14:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:13.409821 543705 memory.go:191] Add success.
I0319 22:14:13.409820 543705 cpu.go:282] Add success.
W0319 22:14:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:14:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:14:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:14:13.420424 543705 net.go:648] Add success.
I0319 22:14:13.423452 543705 net.go:770] primary dev: ETH0
I0319 22:14:13.423466 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:14:13.423480 543705 net.go:698] Add success.
I0319 22:14:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:14:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:14:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 22:14:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:14:14.456512 543705 disk_worker.go:494] system disk:vda1
I0319 22:14:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:14:15.456012 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:14:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:14:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:14:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:14:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:14:21.241674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:14:21.244113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:14:21.244119 543705 disk_info.go:196] parse disk info done, disk is : [0xc000512680 0xc0005126c0]
E0319 22:14:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:23.409799 543705 memory.go:184] no items to output this cycle
I0319 22:14:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 22:14:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:33.409774 543705 memory.go:184] no items to output this cycle
I0319 22:14:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 22:14:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:43.409802 543705 memory.go:191] Add success.
I0319 22:14:43.409803 543705 cpu.go:282] Add success.
I0319 22:14:43.419905 543705 net.go:648] Add success.
I0319 22:14:43.422588 543705 net.go:770] primary dev: ETH0
I0319 22:14:43.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:14:43.422618 543705 net.go:698] Add success.
I0319 22:14:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:14:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:14:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:14:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:14:53.409799 543705 memory.go:184] no items to output this cycle
I0319 22:14:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 22:15:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:03.409776 543705 memory.go:184] no items to output this cycle
I0319 22:15:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:15:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:13.409773 543705 memory.go:191] Add success.
W0319 22:15:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:15:13.409807 543705 cpu.go:282] Add success.
W0319 22:15:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:15:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:15:13.420250 543705 net.go:648] Add success.
I0319 22:15:13.422986 543705 net.go:770] primary dev: ETH0
I0319 22:15:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:15:13.423019 543705 net.go:698] Add success.
I0319 22:15:13.468922 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cddf8d00-985f-4e45-b1fc-e11492ac55f8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:15:13.468953 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:15:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:15:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:15:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0319 22:15:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:15:14.457334 543705 disk_worker.go:494] system disk:vda1
I0319 22:15:14.457444 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:15:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:15:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:15:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:15:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:15:16.472413 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:15:21.245675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:15:21.248100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:15:21.248107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a0e00 0xc0002a0e40]
E0319 22:15:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:23.409793 543705 memory.go:184] no items to output this cycle
I0319 22:15:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:15:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:33.409764 543705 memory.go:184] no items to output this cycle
I0319 22:15:33.409806 543705 cpu.go:275] no items to output this cycle
I0319 22:15:37.924318 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:15:37.924325 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:15:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:43.410702 543705 memory.go:191] Add success.
I0319 22:15:43.409805 543705 cpu.go:282] Add success.
I0319 22:15:43.420459 543705 net.go:648] Add success.
I0319 22:15:43.423321 543705 net.go:770] primary dev: ETH0
I0319 22:15:43.423333 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:15:43.423348 543705 net.go:698] Add success.
I0319 22:15:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:15:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:15:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:15:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:15:53.409771 543705 memory.go:184] no items to output this cycle
I0319 22:15:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:16:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:03.409801 543705 memory.go:184] no items to output this cycle
I0319 22:16:03.409814 543705 cpu.go:275] no items to output this cycle
W0319 22:16:13.409707 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:16:13.409728 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:16:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:16:13.409794 543705 cpu.go:282] Add success.
E0319 22:16:13.409833 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:13.409855 543705 memory.go:191] Add success.
I0319 22:16:13.420222 543705 net.go:648] Add success.
I0319 22:16:13.422974 543705 net.go:770] primary dev: ETH0
I0319 22:16:13.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:16:13.422998 543705 net.go:698] Add success.
I0319 22:16:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:16:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:16:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 22:16:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:16:14.456726 543705 disk_worker.go:494] system disk:vda1
I0319 22:16:14.456754 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:16:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:16:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:16:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:16:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:16:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:16:21.249676 543705 disk_info.go:125] begin check local disk info of client
I0319 22:16:21.252069 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:16:21.252076 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad040 0xc0003ad080]
E0319 22:16:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:23.409787 543705 memory.go:184] no items to output this cycle
I0319 22:16:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 22:16:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:33.409780 543705 memory.go:184] no items to output this cycle
I0319 22:16:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 22:16:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:43.409819 543705 memory.go:191] Add success.
I0319 22:16:43.409828 543705 cpu.go:282] Add success.
I0319 22:16:43.420015 543705 net.go:648] Add success.
I0319 22:16:43.422825 543705 net.go:770] primary dev: ETH0
I0319 22:16:43.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:16:43.422855 543705 net.go:698] Add success.
I0319 22:16:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:16:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:16:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:16:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:16:53.409779 543705 memory.go:184] no items to output this cycle
I0319 22:16:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 22:17:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:03.409778 543705 memory.go:184] no items to output this cycle
I0319 22:17:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:17:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:13.409787 543705 memory.go:191] Add success.
I0319 22:17:13.409807 543705 cpu.go:282] Add success.
W0319 22:17:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:17:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:17:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:17:13.420121 543705 net.go:648] Add success.
I0319 22:17:13.422865 543705 net.go:770] primary dev: ETH0
I0319 22:17:13.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:17:13.422892 543705 net.go:698] Add success.
I0319 22:17:13.453444 543705 event_worker.go:152] Polling the log file for events...
W0319 22:17:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:17:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 22:17:14.455197 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:17:14.455883 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:17:14.455892 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:17:14.455899 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:17:14.456635 543705 disk_worker.go:494] system disk:vda1
I0319 22:17:14.456677 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:17:15.456815 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:17:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:17:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:17:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:17:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:17:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:17:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:17:21.253672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:17:21.256041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:17:21.256047 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a2c0 0xc00048a300]
E0319 22:17:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:23.409767 543705 memory.go:184] no items to output this cycle
I0319 22:17:23.409773 543705 cpu.go:275] no items to output this cycle
E0319 22:17:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:33.409803 543705 memory.go:184] no items to output this cycle
I0319 22:17:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 22:17:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:43.409783 543705 memory.go:191] Add success.
I0319 22:17:43.409813 543705 cpu.go:282] Add success.
I0319 22:17:43.420001 543705 net.go:648] Add success.
I0319 22:17:43.422781 543705 net.go:770] primary dev: ETH0
I0319 22:17:43.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:17:43.422806 543705 net.go:698] Add success.
I0319 22:17:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:17:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:17:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:17:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:17:53.409767 543705 memory.go:184] no items to output this cycle
I0319 22:17:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 22:18:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:03.409798 543705 memory.go:184] no items to output this cycle
I0319 22:18:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 22:18:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:13.409802 543705 memory.go:191] Add success.
I0319 22:18:13.409801 543705 cpu.go:282] Add success.
W0319 22:18:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:18:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:18:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:18:13.420070 543705 net.go:648] Add success.
I0319 22:18:13.422880 543705 net.go:770] primary dev: ETH0
I0319 22:18:13.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:18:13.422905 543705 net.go:698] Add success.
I0319 22:18:13.468466 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8e79fa02-0efa-4319-b93c-e8bc3b346d23","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:18:13.468499 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:18:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:18:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:18:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 22:18:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:18:14.456541 543705 disk_worker.go:494] system disk:vda1
I0319 22:18:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:18:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:18:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:18:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:18:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:18:16.472413 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:18:21.257675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:18:21.260158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:18:21.260164 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002502c0 0xc000250300]
E0319 22:18:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:23.409773 543705 memory.go:184] no items to output this cycle
I0319 22:18:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:18:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:33.409816 543705 memory.go:184] no items to output this cycle
I0319 22:18:33.409832 543705 cpu.go:275] no items to output this cycle
I0319 22:18:37.925745 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:18:37.925753 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:18:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:43.410666 543705 memory.go:191] Add success.
I0319 22:18:43.409809 543705 cpu.go:282] Add success.
I0319 22:18:43.420443 543705 net.go:648] Add success.
I0319 22:18:43.423086 543705 net.go:770] primary dev: ETH0
I0319 22:18:43.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:18:43.423115 543705 net.go:698] Add success.
I0319 22:18:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:18:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:18:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:18:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:18:53.409802 543705 memory.go:184] no items to output this cycle
I0319 22:18:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 22:19:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:03.409815 543705 memory.go:184] no items to output this cycle
I0319 22:19:03.409829 543705 cpu.go:275] no items to output this cycle
E0319 22:19:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:13.409798 543705 memory.go:191] Add success.
I0319 22:19:13.409818 543705 cpu.go:282] Add success.
W0319 22:19:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:19:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:19:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:19:13.420043 543705 net.go:648] Add success.
I0319 22:19:13.422866 543705 net.go:770] primary dev: ETH0
I0319 22:19:13.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:19:13.422891 543705 net.go:698] Add success.
I0319 22:19:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:19:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:19:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0319 22:19:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:19:14.456485 543705 disk_worker.go:494] system disk:vda1
I0319 22:19:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:19:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:19:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:19:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:19:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:19:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:19:21.261673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:19:21.264109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:19:21.264115 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003042c0 0xc000304300]
E0319 22:19:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:23.409785 543705 memory.go:184] no items to output this cycle
I0319 22:19:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:19:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:33.409784 543705 cpu.go:275] no items to output this cycle
I0319 22:19:33.409791 543705 memory.go:184] no items to output this cycle
E0319 22:19:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:43.409817 543705 memory.go:191] Add success.
I0319 22:19:43.409838 543705 cpu.go:282] Add success.
I0319 22:19:43.419992 543705 net.go:648] Add success.
I0319 22:19:43.422798 543705 net.go:770] primary dev: ETH0
I0319 22:19:43.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:19:43.422834 543705 net.go:698] Add success.
I0319 22:19:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:19:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:19:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:19:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:19:53.409795 543705 memory.go:184] no items to output this cycle
I0319 22:19:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:20:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:03.409792 543705 memory.go:184] no items to output this cycle
I0319 22:20:03.409807 543705 cpu.go:275] no items to output this cycle
E0319 22:20:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:13.409790 543705 memory.go:191] Add success.
I0319 22:20:13.409809 543705 cpu.go:282] Add success.
W0319 22:20:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:20:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:20:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:20:13.420162 543705 net.go:648] Add success.
I0319 22:20:13.422928 543705 net.go:770] primary dev: ETH0
I0319 22:20:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:20:13.422956 543705 net.go:698] Add success.
I0319 22:20:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:20:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:20:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 22:20:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:20:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 22:20:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:20:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:20:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:20:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:20:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:20:21.265673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:20:21.268120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:20:21.268127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003749c0 0xc000374a00]
E0319 22:20:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:23.409760 543705 memory.go:184] no items to output this cycle
I0319 22:20:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 22:20:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:33.409816 543705 memory.go:184] no items to output this cycle
I0319 22:20:33.409825 543705 cpu.go:275] no items to output this cycle
E0319 22:20:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:43.409787 543705 memory.go:191] Add success.
I0319 22:20:43.409845 543705 cpu.go:282] Add success.
I0319 22:20:43.420154 543705 net.go:648] Add success.
I0319 22:20:43.422996 543705 net.go:770] primary dev: ETH0
I0319 22:20:43.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:20:43.423023 543705 net.go:698] Add success.
I0319 22:20:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:20:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:20:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:20:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:20:53.409779 543705 cpu.go:275] no items to output this cycle
I0319 22:20:53.409781 543705 memory.go:184] no items to output this cycle
E0319 22:21:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:03.409787 543705 memory.go:184] no items to output this cycle
I0319 22:21:03.409790 543705 cpu.go:275] no items to output this cycle
E0319 22:21:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:13.409800 543705 memory.go:191] Add success.
I0319 22:21:13.409801 543705 cpu.go:282] Add success.
W0319 22:21:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:21:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:21:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:21:13.420186 543705 net.go:648] Add success.
I0319 22:21:13.423184 543705 net.go:770] primary dev: ETH0
I0319 22:21:13.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:21:13.423215 543705 net.go:698] Add success.
I0319 22:21:13.490448 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"938596c9-201f-4ce2-9ecf-064faf97f510","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:21:13.490484 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:21:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:21:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:21:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0319 22:21:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:21:14.456498 543705 disk_worker.go:494] system disk:vda1
I0319 22:21:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:21:15.455983 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:21:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:21:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:21:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:21:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:21:21.269674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:21:21.272110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:21:21.272116 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330100 0xc000330140]
E0319 22:21:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:23.409774 543705 memory.go:184] no items to output this cycle
I0319 22:21:23.409778 543705 cpu.go:275] no items to output this cycle
E0319 22:21:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:33.409805 543705 memory.go:184] no items to output this cycle
I0319 22:21:33.409819 543705 cpu.go:275] no items to output this cycle
I0319 22:21:37.925899 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:21:37.925907 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:21:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:43.410721 543705 memory.go:191] Add success.
I0319 22:21:43.409828 543705 cpu.go:282] Add success.
I0319 22:21:43.420430 543705 net.go:648] Add success.
I0319 22:21:43.423041 543705 net.go:770] primary dev: ETH0
I0319 22:21:43.423054 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:21:43.423067 543705 net.go:698] Add success.
I0319 22:21:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:21:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:21:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:21:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:21:53.409774 543705 memory.go:184] no items to output this cycle
I0319 22:21:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:22:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:03.409790 543705 memory.go:184] no items to output this cycle
I0319 22:22:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:22:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:13.409823 543705 memory.go:191] Add success.
I0319 22:22:13.409825 543705 cpu.go:282] Add success.
W0319 22:22:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:22:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:22:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:22:13.420213 543705 net.go:648] Add success.
I0319 22:22:13.422985 543705 net.go:770] primary dev: ETH0
I0319 22:22:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:22:13.423019 543705 net.go:698] Add success.
W0319 22:22:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:22:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 22:22:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:22:14.456539 543705 disk_worker.go:494] system disk:vda1
I0319 22:22:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:22:14.457875 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:22:14.457884 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:22:14.457891 543705 custom_config.go:64] query custom config with name: gpu
E0319 22:22:15.456829 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:22:15.456838 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:22:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:22:16.457978 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:22:16.458018 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:22:16.458035 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:22:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:22:21.273672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:22:21.276109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:22:21.276115 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004600c0 0xc000460100]
E0319 22:22:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:23.409790 543705 memory.go:184] no items to output this cycle
I0319 22:22:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 22:22:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:33.409804 543705 memory.go:184] no items to output this cycle
I0319 22:22:33.409816 543705 cpu.go:275] no items to output this cycle
E0319 22:22:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:43.409793 543705 memory.go:191] Add success.
I0319 22:22:43.409811 543705 cpu.go:282] Add success.
I0319 22:22:43.419949 543705 net.go:648] Add success.
I0319 22:22:43.422904 543705 net.go:770] primary dev: ETH0
I0319 22:22:43.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:22:43.422929 543705 net.go:698] Add success.
I0319 22:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:22:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:22:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:22:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:22:53.409773 543705 memory.go:184] no items to output this cycle
I0319 22:22:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:23:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:03.409809 543705 memory.go:184] no items to output this cycle
I0319 22:23:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 22:23:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:13.409780 543705 memory.go:191] Add success.
I0319 22:23:13.409801 543705 cpu.go:282] Add success.
W0319 22:23:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:23:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:23:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:23:13.420302 543705 net.go:648] Add success.
I0319 22:23:13.423137 543705 net.go:770] primary dev: ETH0
I0319 22:23:13.423150 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:23:13.423161 543705 net.go:698] Add success.
I0319 22:23:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:23:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:23:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 22:23:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:23:14.456509 543705 disk_worker.go:494] system disk:vda1
I0319 22:23:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:23:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:23:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:23:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:23:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:23:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:23:21.277682 543705 disk_info.go:125] begin check local disk info of client
I0319 22:23:21.280197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:23:21.280205 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e040 0xc00039e080]
E0319 22:23:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:23.409775 543705 memory.go:184] no items to output this cycle
I0319 22:23:23.409787 543705 cpu.go:275] no items to output this cycle
E0319 22:23:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:33.409803 543705 memory.go:184] no items to output this cycle
I0319 22:23:33.409820 543705 cpu.go:275] no items to output this cycle
E0319 22:23:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:43.409821 543705 memory.go:191] Add success.
I0319 22:23:43.409830 543705 cpu.go:282] Add success.
I0319 22:23:43.420059 543705 net.go:648] Add success.
I0319 22:23:43.423224 543705 net.go:770] primary dev: ETH0
I0319 22:23:43.423237 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:23:43.423258 543705 net.go:698] Add success.
I0319 22:23:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:23:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:23:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:23:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:23:53.409771 543705 memory.go:184] no items to output this cycle
I0319 22:23:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:24:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:03.409770 543705 memory.go:184] no items to output this cycle
I0319 22:24:03.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:24:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:13.409788 543705 memory.go:191] Add success.
I0319 22:24:13.409808 543705 cpu.go:282] Add success.
W0319 22:24:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:24:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:24:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:24:13.420045 543705 net.go:648] Add success.
I0319 22:24:13.423018 543705 net.go:770] primary dev: ETH0
I0319 22:24:13.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:24:13.423045 543705 net.go:698] Add success.
I0319 22:24:13.466476 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b1f1a33f-688d-4ced-95ff-f953f5e4b142","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:24:13.466508 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:24:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:24:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:24:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 22:24:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:24:14.456619 543705 disk_worker.go:494] system disk:vda1
I0319 22:24:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:24:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:24:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:24:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:24:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:24:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:24:21.281672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:24:21.284101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:24:21.284107 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278080 0xc000278100]
E0319 22:24:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:23.409796 543705 memory.go:184] no items to output this cycle
I0319 22:24:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:24:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:33.409778 543705 memory.go:184] no items to output this cycle
I0319 22:24:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 22:24:37.926052 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:24:37.926059 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:24:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:43.410655 543705 memory.go:191] Add success.
I0319 22:24:43.409833 543705 cpu.go:282] Add success.
I0319 22:24:43.420351 543705 net.go:648] Add success.
I0319 22:24:43.423175 543705 net.go:770] primary dev: ETH0
I0319 22:24:43.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:24:43.423200 543705 net.go:698] Add success.
I0319 22:24:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:24:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:24:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:24:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:24:53.409773 543705 memory.go:184] no items to output this cycle
I0319 22:24:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 22:25:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:03.409787 543705 memory.go:184] no items to output this cycle
I0319 22:25:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:25:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:13.409796 543705 memory.go:191] Add success.
I0319 22:25:13.409798 543705 cpu.go:282] Add success.
W0319 22:25:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:25:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:25:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:25:13.420300 543705 net.go:648] Add success.
I0319 22:25:13.423352 543705 net.go:770] primary dev: ETH0
I0319 22:25:13.423371 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:25:13.423384 543705 net.go:698] Add success.
I0319 22:25:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:25:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:25:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 22:25:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:25:14.456610 543705 disk_worker.go:494] system disk:vda1
I0319 22:25:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:25:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:25:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:25:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:25:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:25:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:25:21.285672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:25:21.288076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:25:21.288083 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a62c0 0xc0004a6300]
E0319 22:25:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:23.409757 543705 memory.go:184] no items to output this cycle
I0319 22:25:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:25:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:33.409783 543705 memory.go:184] no items to output this cycle
I0319 22:25:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 22:25:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:43.409823 543705 memory.go:191] Add success.
I0319 22:25:43.409842 543705 cpu.go:282] Add success.
I0319 22:25:43.419886 543705 net.go:648] Add success.
I0319 22:25:43.422719 543705 net.go:770] primary dev: ETH0
I0319 22:25:43.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:25:43.422747 543705 net.go:698] Add success.
I0319 22:25:46.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:25:46.457995 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:25:46.458023 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:25:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:25:53.409796 543705 memory.go:184] no items to output this cycle
I0319 22:25:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:26:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:03.409770 543705 memory.go:184] no items to output this cycle
I0319 22:26:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:26:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:13.409820 543705 memory.go:191] Add success.
I0319 22:26:13.409822 543705 cpu.go:282] Add success.
W0319 22:26:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:26:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:26:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:26:13.420393 543705 net.go:648] Add success.
I0319 22:26:13.423111 543705 net.go:770] primary dev: ETH0
I0319 22:26:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:26:13.423136 543705 net.go:698] Add success.
I0319 22:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:26:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:26:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 22:26:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:26:14.456807 543705 disk_worker.go:494] system disk:vda1
I0319 22:26:14.456839 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:26:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:26:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:26:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:26:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:26:21.289686 543705 disk_info.go:125] begin check local disk info of client
I0319 22:26:21.292088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:26:21.292094 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386000 0xc000386040]
E0319 22:26:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:23.409761 543705 memory.go:184] no items to output this cycle
I0319 22:26:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 22:26:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:33.409806 543705 memory.go:184] no items to output this cycle
I0319 22:26:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:26:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:43.409796 543705 memory.go:191] Add success.
I0319 22:26:43.409814 543705 cpu.go:282] Add success.
I0319 22:26:43.420046 543705 net.go:648] Add success.
I0319 22:26:43.422734 543705 net.go:770] primary dev: ETH0
I0319 22:26:43.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:26:43.422759 543705 net.go:698] Add success.
I0319 22:26:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:26:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:26:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:26:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:26:53.409798 543705 memory.go:184] no items to output this cycle
I0319 22:26:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:27:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:03.409811 543705 memory.go:184] no items to output this cycle
I0319 22:27:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:27:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:13.409803 543705 memory.go:191] Add success.
I0319 22:27:13.409804 543705 cpu.go:282] Add success.
W0319 22:27:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:27:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:27:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:27:13.420122 543705 net.go:648] Add success.
I0319 22:27:13.422804 543705 net.go:770] primary dev: ETH0
I0319 22:27:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:27:13.422833 543705 net.go:698] Add success.
I0319 22:27:13.428860 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 22:27:13.453028 543705 event_worker.go:152] Polling the log file for events...
I0319 22:27:13.468849 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6f08d9e2-e605-4970-a93c-a214b9a560a0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:27:13.468882 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 22:27:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:27:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 22:27:14.455208 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:27:14.455921 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:27:14.455930 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:27:14.455936 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:27:14.456730 543705 disk_worker.go:494] system disk:vda1
I0319 22:27:14.456763 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:27:15.456856 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:27:15.456864 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:27:16.457947 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:27:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:27:16.458004 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:27:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:27:16.472355 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:27:21.293682 543705 disk_info.go:125] begin check local disk info of client
I0319 22:27:21.296063 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:27:21.296070 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e3c0 0xc00039e400]
E0319 22:27:23.410387 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:23.410405 543705 memory.go:184] no items to output this cycle
I0319 22:27:23.410436 543705 cpu.go:275] no items to output this cycle
E0319 22:27:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:33.409795 543705 memory.go:184] no items to output this cycle
I0319 22:27:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 22:27:37.926208 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:27:37.926216 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:27:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:43.410698 543705 memory.go:191] Add success.
I0319 22:27:43.409811 543705 cpu.go:282] Add success.
I0319 22:27:43.420420 543705 net.go:648] Add success.
I0319 22:27:43.423091 543705 net.go:770] primary dev: ETH0
I0319 22:27:43.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:27:43.423118 543705 net.go:698] Add success.
I0319 22:27:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:27:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:27:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:27:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:27:53.409813 543705 memory.go:184] no items to output this cycle
I0319 22:27:53.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:28:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:03.409775 543705 memory.go:184] no items to output this cycle
I0319 22:28:03.409827 543705 cpu.go:275] no items to output this cycle
E0319 22:28:13.409834 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:13.409861 543705 memory.go:191] Add success.
I0319 22:28:13.409862 543705 cpu.go:282] Add success.
W0319 22:28:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:28:13.409901 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:28:13.409904 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:28:13.420174 543705 net.go:648] Add success.
I0319 22:28:13.423315 543705 net.go:770] primary dev: ETH0
I0319 22:28:13.423330 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:28:13.423344 543705 net.go:698] Add success.
I0319 22:28:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:28:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:28:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 22:28:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:28:14.456497 543705 disk_worker.go:494] system disk:vda1
I0319 22:28:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:28:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:28:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:28:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:28:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:28:16.472091 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:28:21.297681 543705 disk_info.go:125] begin check local disk info of client
I0319 22:28:21.300111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:28:21.300118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ae280 0xc0003ae2c0]
E0319 22:28:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:23.409806 543705 memory.go:184] no items to output this cycle
I0319 22:28:23.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:28:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:33.409767 543705 memory.go:184] no items to output this cycle
I0319 22:28:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 22:28:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:43.409802 543705 memory.go:191] Add success.
I0319 22:28:43.409820 543705 cpu.go:282] Add success.
I0319 22:28:43.419999 543705 net.go:648] Add success.
I0319 22:28:43.422689 543705 net.go:770] primary dev: ETH0
I0319 22:28:43.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:28:43.422715 543705 net.go:698] Add success.
I0319 22:28:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:28:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:28:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:28:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:28:53.409804 543705 memory.go:184] no items to output this cycle
I0319 22:28:53.409817 543705 cpu.go:275] no items to output this cycle
E0319 22:29:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:03.409788 543705 memory.go:184] no items to output this cycle
I0319 22:29:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 22:29:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:13.409826 543705 memory.go:191] Add success.
I0319 22:29:13.409829 543705 cpu.go:282] Add success.
W0319 22:29:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:29:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:29:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:29:13.420190 543705 net.go:648] Add success.
I0319 22:29:13.423071 543705 net.go:770] primary dev: ETH0
I0319 22:29:13.423088 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:29:13.423106 543705 net.go:698] Add success.
I0319 22:29:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:29:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:29:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0319 22:29:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:29:14.456578 543705 disk_worker.go:494] system disk:vda1
I0319 22:29:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:29:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:29:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:29:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:29:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:29:16.472461 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:29:21.301673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:29:21.304151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:29:21.304158 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035da80 0xc00035dac0]
E0319 22:29:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:23.409785 543705 memory.go:184] no items to output this cycle
I0319 22:29:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:29:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:33.409809 543705 memory.go:184] no items to output this cycle
I0319 22:29:33.409822 543705 cpu.go:275] no items to output this cycle
E0319 22:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:43.409804 543705 memory.go:191] Add success.
I0319 22:29:43.409820 543705 cpu.go:282] Add success.
I0319 22:29:43.420053 543705 net.go:648] Add success.
I0319 22:29:43.422809 543705 net.go:770] primary dev: ETH0
I0319 22:29:43.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:29:43.422835 543705 net.go:698] Add success.
I0319 22:29:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:29:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:29:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:29:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:29:53.409789 543705 memory.go:184] no items to output this cycle
I0319 22:29:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:30:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:03.409763 543705 memory.go:184] no items to output this cycle
I0319 22:30:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 22:30:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:13.409837 543705 memory.go:191] Add success.
I0319 22:30:13.409843 543705 cpu.go:282] Add success.
W0319 22:30:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:30:13.409885 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:30:13.409889 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:30:13.420137 543705 net.go:648] Add success.
I0319 22:30:13.423230 543705 net.go:770] primary dev: ETH0
I0319 22:30:13.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:30:13.423256 543705 net.go:698] Add success.
I0319 22:30:13.478670 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d75154ba-4a6e-44cc-a869-e8c4ab0778c8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:30:13.478704 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:30:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:30:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:30:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0319 22:30:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:30:14.456503 543705 disk_worker.go:494] system disk:vda1
I0319 22:30:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:30:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:30:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:30:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:30:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:30:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:30:21.305679 543705 disk_info.go:125] begin check local disk info of client
I0319 22:30:21.308133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:30:21.308140 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328f00 0xc000328f40]
E0319 22:30:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:23.409792 543705 memory.go:184] no items to output this cycle
I0319 22:30:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:30:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:33.409796 543705 memory.go:184] no items to output this cycle
I0319 22:30:33.409802 543705 cpu.go:275] no items to output this cycle
I0319 22:30:37.926359 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:30:37.926367 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:30:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:43.410671 543705 memory.go:191] Add success.
I0319 22:30:43.409820 543705 cpu.go:282] Add success.
I0319 22:30:43.420364 543705 net.go:648] Add success.
I0319 22:30:43.422982 543705 net.go:770] primary dev: ETH0
I0319 22:30:43.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:30:43.423008 543705 net.go:698] Add success.
I0319 22:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:30:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:30:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:30:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:30:53.409786 543705 cpu.go:275] no items to output this cycle
I0319 22:30:53.409788 543705 memory.go:184] no items to output this cycle
E0319 22:31:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:03.409786 543705 memory.go:184] no items to output this cycle
I0319 22:31:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:31:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:13.409777 543705 memory.go:191] Add success.
I0319 22:31:13.409797 543705 cpu.go:282] Add success.
W0319 22:31:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:31:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:31:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:31:13.420053 543705 net.go:648] Add success.
I0319 22:31:13.422816 543705 net.go:770] primary dev: ETH0
I0319 22:31:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:31:13.422843 543705 net.go:698] Add success.
I0319 22:31:14.454271 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:31:14.454485 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:31:14.454496 543705 disk_worker.go:708] disk space is not compliant
W0319 22:31:14.454499 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:31:14.455873 543705 disk_worker.go:494] system disk:vda1
I0319 22:31:14.455911 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:31:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:31:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:31:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:31:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:31:16.472771 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:31:21.309678 543705 disk_info.go:125] begin check local disk info of client
I0319 22:31:21.312122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:31:21.312128 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be7c0 0xc0002be800]
E0319 22:31:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:23.409792 543705 memory.go:184] no items to output this cycle
I0319 22:31:23.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:31:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:33.409767 543705 memory.go:184] no items to output this cycle
I0319 22:31:33.409799 543705 cpu.go:275] no items to output this cycle
E0319 22:31:43.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:43.409924 543705 memory.go:191] Add success.
I0319 22:31:43.410009 543705 cpu.go:282] Add success.
I0319 22:31:43.419735 543705 net.go:648] Add success.
I0319 22:31:43.422448 543705 net.go:770] primary dev: ETH0
I0319 22:31:43.422462 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:31:43.422473 543705 net.go:698] Add success.
I0319 22:31:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:31:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:31:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:31:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:31:53.409792 543705 memory.go:184] no items to output this cycle
I0319 22:31:53.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:32:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:03.409770 543705 memory.go:184] no items to output this cycle
I0319 22:32:03.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:32:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:13.409789 543705 memory.go:191] Add success.
I0319 22:32:13.409808 543705 cpu.go:282] Add success.
W0319 22:32:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:32:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:32:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:32:13.420145 543705 net.go:648] Add success.
I0319 22:32:13.422784 543705 net.go:770] primary dev: ETH0
I0319 22:32:13.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:32:13.422814 543705 net.go:698] Add success.
W0319 22:32:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:32:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 22:32:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:32:14.456952 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:32:14.456961 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:32:14.456967 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:32:14.457005 543705 disk_worker.go:494] system disk:vda1
I0319 22:32:14.457035 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:32:15.456844 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:32:15.456853 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:32:16.457910 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:32:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:32:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:32:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:32:16.472340 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:32:21.313675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:32:21.316191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:32:21.316198 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c9f00 0xc0003c9f40]
E0319 22:32:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:23.409778 543705 memory.go:184] no items to output this cycle
I0319 22:32:23.409783 543705 cpu.go:275] no items to output this cycle
E0319 22:32:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:33.409794 543705 memory.go:184] no items to output this cycle
I0319 22:32:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 22:32:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:43.409906 543705 memory.go:191] Add success.
I0319 22:32:43.409985 543705 cpu.go:282] Add success.
I0319 22:32:43.419742 543705 net.go:648] Add success.
I0319 22:32:43.422431 543705 net.go:770] primary dev: ETH0
I0319 22:32:43.422446 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:32:43.422458 543705 net.go:698] Add success.
I0319 22:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:32:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:32:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:32:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:32:53.409771 543705 memory.go:184] no items to output this cycle
I0319 22:32:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 22:33:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:03.409792 543705 memory.go:184] no items to output this cycle
I0319 22:33:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 22:33:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:13.409794 543705 memory.go:191] Add success.
I0319 22:33:13.409796 543705 cpu.go:282] Add success.
W0319 22:33:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:33:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:33:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:33:13.420166 543705 net.go:648] Add success.
I0319 22:33:13.423003 543705 net.go:770] primary dev: ETH0
I0319 22:33:13.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:33:13.423027 543705 net.go:698] Add success.
I0319 22:33:13.470366 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b4483fd6-2c4c-4384-876d-c1575275776b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:33:13.470398 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:33:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:33:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:33:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 22:33:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:33:14.456508 543705 disk_worker.go:494] system disk:vda1
I0319 22:33:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:33:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:33:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:33:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:33:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:33:16.472482 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:33:21.317674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:33:21.320269 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:33:21.320278 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032bac0 0xc00032bb00]
E0319 22:33:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:23.409794 543705 memory.go:184] no items to output this cycle
I0319 22:33:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 22:33:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:33.409775 543705 memory.go:184] no items to output this cycle
I0319 22:33:33.409783 543705 cpu.go:275] no items to output this cycle
I0319 22:33:37.928335 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:33:37.928343 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:33:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:43.410715 543705 memory.go:191] Add success.
I0319 22:33:43.409814 543705 cpu.go:282] Add success.
I0319 22:33:43.420406 543705 net.go:648] Add success.
I0319 22:33:43.423136 543705 net.go:770] primary dev: ETH0
I0319 22:33:43.423150 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:33:43.423161 543705 net.go:698] Add success.
I0319 22:33:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:33:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:33:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:33:53.409910 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:33:53.409937 543705 memory.go:184] no items to output this cycle
I0319 22:33:53.410004 543705 cpu.go:275] no items to output this cycle
E0319 22:34:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:03.409775 543705 memory.go:184] no items to output this cycle
I0319 22:34:03.409789 543705 cpu.go:275] no items to output this cycle
E0319 22:34:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:13.409826 543705 memory.go:191] Add success.
I0319 22:34:13.409831 543705 cpu.go:282] Add success.
W0319 22:34:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:34:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:34:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:34:13.420139 543705 net.go:648] Add success.
I0319 22:34:13.422770 543705 net.go:770] primary dev: ETH0
I0319 22:34:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:34:13.422794 543705 net.go:698] Add success.
I0319 22:34:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:34:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:34:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 22:34:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:34:14.456498 543705 disk_worker.go:494] system disk:vda1
I0319 22:34:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:34:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:34:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:34:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:34:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:34:21.321675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:34:21.324177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:34:21.324183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4cc0 0xc0000c4d00]
E0319 22:34:23.410414 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:23.410430 543705 memory.go:184] no items to output this cycle
I0319 22:34:23.410461 543705 cpu.go:275] no items to output this cycle
E0319 22:34:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:33.409773 543705 memory.go:184] no items to output this cycle
I0319 22:34:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:34:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:43.409820 543705 memory.go:191] Add success.
I0319 22:34:43.409826 543705 cpu.go:282] Add success.
I0319 22:34:43.420039 543705 net.go:648] Add success.
I0319 22:34:43.422884 543705 net.go:770] primary dev: ETH0
I0319 22:34:43.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:34:43.422910 543705 net.go:698] Add success.
I0319 22:34:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:34:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:34:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:34:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:34:53.409771 543705 memory.go:184] no items to output this cycle
I0319 22:34:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 22:35:03.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:03.409890 543705 memory.go:184] no items to output this cycle
I0319 22:35:03.409941 543705 cpu.go:275] no items to output this cycle
E0319 22:35:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:13.409816 543705 memory.go:191] Add success.
I0319 22:35:13.409826 543705 cpu.go:282] Add success.
W0319 22:35:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:35:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:35:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:35:13.420240 543705 net.go:648] Add success.
I0319 22:35:13.423145 543705 net.go:770] primary dev: ETH0
I0319 22:35:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:35:13.423171 543705 net.go:698] Add success.
I0319 22:35:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:35:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:35:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 22:35:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:35:14.456587 543705 disk_worker.go:494] system disk:vda1
I0319 22:35:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:35:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:35:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:35:16.458071 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:35:16.458097 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:35:16.472485 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:35:21.327014 543705 disk_info.go:125] begin check local disk info of client
I0319 22:35:21.329568 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:35:21.329575 543705 disk_info.go:196] parse disk info done, disk is : [0xc000469bc0 0xc000469c00]
E0319 22:35:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:23.409792 543705 memory.go:184] no items to output this cycle
I0319 22:35:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 22:35:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:33.409800 543705 memory.go:184] no items to output this cycle
I0319 22:35:33.409812 543705 cpu.go:275] no items to output this cycle
E0319 22:35:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:43.409824 543705 memory.go:191] Add success.
I0319 22:35:43.409829 543705 cpu.go:282] Add success.
I0319 22:35:43.419959 543705 net.go:648] Add success.
I0319 22:35:43.422993 543705 net.go:770] primary dev: ETH0
I0319 22:35:43.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:35:43.423018 543705 net.go:698] Add success.
I0319 22:35:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:35:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:35:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:35:53.410249 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:35:53.410267 543705 memory.go:184] no items to output this cycle
I0319 22:35:53.410279 543705 cpu.go:275] no items to output this cycle
E0319 22:36:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:03.409887 543705 cpu.go:275] no items to output this cycle
I0319 22:36:03.409903 543705 memory.go:184] no items to output this cycle
E0319 22:36:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:13.409816 543705 memory.go:191] Add success.
I0319 22:36:13.409830 543705 cpu.go:282] Add success.
W0319 22:36:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:36:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:36:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:36:13.420203 543705 net.go:648] Add success.
I0319 22:36:13.423018 543705 net.go:770] primary dev: ETH0
I0319 22:36:13.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:36:13.423044 543705 net.go:698] Add success.
I0319 22:36:13.468981 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5e314af6-1795-44a1-8195-d48a2ae2f517","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:36:13.469016 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:36:14.453929 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:36:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:36:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 22:36:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:36:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 22:36:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:36:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:36:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:36:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:36:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:36:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:36:21.329680 543705 disk_info.go:125] begin check local disk info of client
I0319 22:36:21.332115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:36:21.332121 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4480 0xc0000c4540]
E0319 22:36:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:36:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:36:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:33.409780 543705 memory.go:184] no items to output this cycle
I0319 22:36:33.409784 543705 cpu.go:275] no items to output this cycle
I0319 22:36:37.929735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:36:37.929743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:36:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:43.410719 543705 memory.go:191] Add success.
I0319 22:36:43.409803 543705 cpu.go:282] Add success.
I0319 22:36:43.420423 543705 net.go:648] Add success.
I0319 22:36:43.423596 543705 net.go:770] primary dev: ETH0
I0319 22:36:43.423609 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:36:43.423621 543705 net.go:698] Add success.
I0319 22:36:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:36:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:36:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:36:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:36:53.409789 543705 cpu.go:275] no items to output this cycle
I0319 22:36:53.409796 543705 memory.go:184] no items to output this cycle
E0319 22:37:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:03.409793 543705 memory.go:184] no items to output this cycle
I0319 22:37:03.409800 543705 cpu.go:275] no items to output this cycle
I0319 22:37:13.409805 543705 cpu.go:282] Add success.
E0319 22:37:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:13.409841 543705 memory.go:191] Add success.
W0319 22:37:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:37:13.409897 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:37:13.409902 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:37:13.420224 543705 net.go:648] Add success.
I0319 22:37:13.423182 543705 net.go:770] primary dev: ETH0
I0319 22:37:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:37:13.423207 543705 net.go:698] Add success.
I0319 22:37:13.452809 543705 event_worker.go:152] Polling the log file for events...
W0319 22:37:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:37:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 22:37:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:37:14.456865 543705 disk_worker.go:494] system disk:vda1
I0319 22:37:14.456908 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:37:14.457178 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:37:14.457186 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:37:14.457190 543705 custom_config.go:64] query custom config with name: gpu
E0319 22:37:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:37:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:37:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:37:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:37:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:37:16.458013 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:37:16.472498 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:37:21.333673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:37:21.336053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:37:21.336059 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466b80 0xc000466bc0]
E0319 22:37:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:23.409788 543705 memory.go:184] no items to output this cycle
I0319 22:37:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:37:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:33.409767 543705 memory.go:184] no items to output this cycle
I0319 22:37:33.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:37:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:43.409800 543705 memory.go:191] Add success.
I0319 22:37:43.409824 543705 cpu.go:282] Add success.
I0319 22:37:43.419879 543705 net.go:648] Add success.
I0319 22:37:43.422749 543705 net.go:770] primary dev: ETH0
I0319 22:37:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:37:43.422786 543705 net.go:698] Add success.
I0319 22:37:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:37:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:37:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:37:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:37:53.409782 543705 memory.go:184] no items to output this cycle
I0319 22:37:53.409823 543705 cpu.go:275] no items to output this cycle
E0319 22:38:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:03.409891 543705 memory.go:184] no items to output this cycle
I0319 22:38:03.409910 543705 cpu.go:275] no items to output this cycle
W0319 22:38:13.409726 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:38:13.409743 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:38:13.409748 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:38:13.409818 543705 cpu.go:282] Add success.
E0319 22:38:13.409826 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:13.409844 543705 memory.go:191] Add success.
I0319 22:38:13.420293 543705 net.go:648] Add success.
I0319 22:38:13.423248 543705 net.go:770] primary dev: ETH0
I0319 22:38:13.423262 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:38:13.423276 543705 net.go:698] Add success.
I0319 22:38:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:38:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:38:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 22:38:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:38:14.456575 543705 disk_worker.go:494] system disk:vda1
I0319 22:38:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:38:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:38:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:38:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:38:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:38:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:38:21.337678 543705 disk_info.go:125] begin check local disk info of client
I0319 22:38:21.340092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:38:21.340098 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007aec0 0xc00007af00]
E0319 22:38:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:23.409802 543705 memory.go:184] no items to output this cycle
I0319 22:38:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 22:38:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:33.409814 543705 memory.go:184] no items to output this cycle
I0319 22:38:33.409824 543705 cpu.go:275] no items to output this cycle
E0319 22:38:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:43.409838 543705 memory.go:191] Add success.
I0319 22:38:43.409841 543705 cpu.go:282] Add success.
I0319 22:38:43.419987 543705 net.go:648] Add success.
I0319 22:38:43.423024 543705 net.go:770] primary dev: ETH0
I0319 22:38:43.423038 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:38:43.423051 543705 net.go:698] Add success.
I0319 22:38:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:38:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:38:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:38:53.410396 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:38:53.410413 543705 memory.go:184] no items to output this cycle
I0319 22:38:53.410447 543705 cpu.go:275] no items to output this cycle
E0319 22:39:03.409919 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:03.409928 543705 cpu.go:275] no items to output this cycle
I0319 22:39:03.409938 543705 memory.go:184] no items to output this cycle
E0319 22:39:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:13.409809 543705 memory.go:191] Add success.
I0319 22:39:13.409810 543705 cpu.go:282] Add success.
W0319 22:39:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:39:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:39:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:39:13.420182 543705 net.go:648] Add success.
I0319 22:39:13.423015 543705 net.go:770] primary dev: ETH0
I0319 22:39:13.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:39:13.423040 543705 net.go:698] Add success.
I0319 22:39:13.468795 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1fdbc050-3e49-4779-88c8-3d05af260123","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:39:13.468848 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:39:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:39:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:39:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0319 22:39:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:39:14.456520 543705 disk_worker.go:494] system disk:vda1
I0319 22:39:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:39:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:39:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:39:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:39:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:39:21.341672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:39:21.344108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:39:21.344114 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa700 0xc0001aa740]
E0319 22:39:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:23.409790 543705 memory.go:184] no items to output this cycle
I0319 22:39:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 22:39:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:33.409782 543705 memory.go:184] no items to output this cycle
I0319 22:39:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 22:39:37.929900 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:39:37.929909 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:39:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:43.410574 543705 memory.go:191] Add success.
I0319 22:39:43.409827 543705 cpu.go:282] Add success.
I0319 22:39:43.420257 543705 net.go:648] Add success.
I0319 22:39:43.423097 543705 net.go:770] primary dev: ETH0
I0319 22:39:43.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:39:43.423125 543705 net.go:698] Add success.
I0319 22:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:39:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:39:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:39:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:39:53.409773 543705 memory.go:184] no items to output this cycle
I0319 22:39:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 22:40:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:03.409781 543705 memory.go:184] no items to output this cycle
I0319 22:40:03.409779 543705 cpu.go:275] no items to output this cycle
E0319 22:40:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:13.409828 543705 memory.go:191] Add success.
I0319 22:40:13.409835 543705 cpu.go:282] Add success.
W0319 22:40:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:40:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:40:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:40:13.420188 543705 net.go:648] Add success.
I0319 22:40:13.422978 543705 net.go:770] primary dev: ETH0
I0319 22:40:13.422991 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:40:13.423003 543705 net.go:698] Add success.
I0319 22:40:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:40:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:40:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0319 22:40:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:40:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 22:40:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:40:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:40:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:40:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:40:16.472464 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:40:21.345672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:40:21.348128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:40:21.348134 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027bb80 0xc00027bbc0]
E0319 22:40:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:23.409759 543705 memory.go:184] no items to output this cycle
I0319 22:40:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:40:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:33.409810 543705 memory.go:184] no items to output this cycle
I0319 22:40:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:40:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:43.409788 543705 memory.go:191] Add success.
I0319 22:40:43.409817 543705 cpu.go:282] Add success.
I0319 22:40:43.419903 543705 net.go:648] Add success.
I0319 22:40:43.422924 543705 net.go:770] primary dev: ETH0
I0319 22:40:43.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:40:43.422956 543705 net.go:698] Add success.
I0319 22:40:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:40:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:40:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:40:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:40:53.409768 543705 memory.go:184] no items to output this cycle
I0319 22:40:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 22:41:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:03.409811 543705 memory.go:184] no items to output this cycle
I0319 22:41:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 22:41:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:13.409863 543705 memory.go:191] Add success.
W0319 22:41:13.409895 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:41:13.409909 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:41:13.409949 543705 cpu.go:282] Add success.
I0319 22:41:13.409912 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:41:13.419727 543705 net.go:648] Add success.
I0319 22:41:13.422460 543705 net.go:770] primary dev: ETH0
I0319 22:41:13.422474 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:41:13.422485 543705 net.go:698] Add success.
I0319 22:41:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:41:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:41:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0319 22:41:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:41:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 22:41:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:41:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:41:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:41:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:41:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:41:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:41:21.349674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:41:21.352115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:41:21.352121 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037fd40 0xc00037fd80]
E0319 22:41:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:23.409788 543705 memory.go:184] no items to output this cycle
I0319 22:41:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:41:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:33.409770 543705 memory.go:184] no items to output this cycle
I0319 22:41:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:41:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:43.409826 543705 memory.go:191] Add success.
I0319 22:41:43.409829 543705 cpu.go:282] Add success.
I0319 22:41:43.419985 543705 net.go:648] Add success.
I0319 22:41:43.422874 543705 net.go:770] primary dev: ETH0
I0319 22:41:43.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:41:43.422900 543705 net.go:698] Add success.
I0319 22:41:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:41:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:41:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:41:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:41:53.409779 543705 memory.go:184] no items to output this cycle
I0319 22:41:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 22:42:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:03.409800 543705 memory.go:184] no items to output this cycle
I0319 22:42:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 22:42:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:13.409813 543705 memory.go:191] Add success.
I0319 22:42:13.409824 543705 cpu.go:282] Add success.
W0319 22:42:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:42:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:42:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:42:13.420296 543705 net.go:648] Add success.
I0319 22:42:13.423454 543705 net.go:770] primary dev: ETH0
I0319 22:42:13.423468 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:42:13.423479 543705 net.go:698] Add success.
W0319 22:42:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:42:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 22:42:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:42:14.456784 543705 disk_worker.go:494] system disk:vda1
I0319 22:42:14.456821 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:42:14.457122 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:42:14.457130 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:42:14.457134 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:42:15.425990 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"01ced21f-85ab-42ef-a2bb-fa64ab921bc2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:42:15.426031 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
E0319 22:42:15.456182 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:42:15.456190 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 22:42:16.457503 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:42:16.458560 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:42:16.458614 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:42:16.458631 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:42:16.472958 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:42:21.353674 543705 disk_info.go:125] begin check local disk info of client
I0319 22:42:21.356092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:42:21.356098 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047d900 0xc00047d940]
E0319 22:42:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:23.409809 543705 memory.go:184] no items to output this cycle
I0319 22:42:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:42:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:33.409796 543705 memory.go:184] no items to output this cycle
I0319 22:42:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 22:42:37.930054 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:42:37.930062 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:42:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:43.410644 543705 memory.go:191] Add success.
I0319 22:42:43.409808 543705 cpu.go:282] Add success.
I0319 22:42:43.420346 543705 net.go:648] Add success.
I0319 22:42:43.423186 543705 net.go:770] primary dev: ETH0
I0319 22:42:43.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:42:43.423229 543705 net.go:698] Add success.
I0319 22:42:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:42:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:42:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:42:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:42:53.409779 543705 memory.go:184] no items to output this cycle
I0319 22:42:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 22:43:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:03.409783 543705 memory.go:184] no items to output this cycle
I0319 22:43:03.409788 543705 cpu.go:275] no items to output this cycle
E0319 22:43:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:13.409798 543705 memory.go:191] Add success.
I0319 22:43:13.409799 543705 cpu.go:282] Add success.
W0319 22:43:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:43:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:43:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:43:13.420307 543705 net.go:648] Add success.
I0319 22:43:13.423066 543705 net.go:770] primary dev: ETH0
I0319 22:43:13.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:43:13.423090 543705 net.go:698] Add success.
I0319 22:43:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:43:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:43:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 22:43:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:43:14.456593 543705 disk_worker.go:494] system disk:vda1
I0319 22:43:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:43:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:43:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:43:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:43:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:43:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:43:21.357676 543705 disk_info.go:125] begin check local disk info of client
I0319 22:43:21.360168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:43:21.360175 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ac400 0xc0003ac440]
E0319 22:43:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:23.409767 543705 memory.go:184] no items to output this cycle
I0319 22:43:23.409786 543705 cpu.go:275] no items to output this cycle
E0319 22:43:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:33.409769 543705 memory.go:184] no items to output this cycle
I0319 22:43:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 22:43:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:43.409827 543705 memory.go:191] Add success.
I0319 22:43:43.409831 543705 cpu.go:282] Add success.
I0319 22:43:43.419851 543705 net.go:648] Add success.
I0319 22:43:43.422582 543705 net.go:770] primary dev: ETH0
I0319 22:43:43.422597 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:43:43.422613 543705 net.go:698] Add success.
I0319 22:43:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:43:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:43:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:43:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:43:53.409802 543705 memory.go:184] no items to output this cycle
I0319 22:43:53.409814 543705 cpu.go:275] no items to output this cycle
E0319 22:44:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:03.409786 543705 cpu.go:275] no items to output this cycle
I0319 22:44:03.409790 543705 memory.go:184] no items to output this cycle
E0319 22:44:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:13.409796 543705 memory.go:191] Add success.
I0319 22:44:13.409795 543705 cpu.go:282] Add success.
W0319 22:44:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:44:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:44:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:44:13.420258 543705 net.go:648] Add success.
I0319 22:44:13.423079 543705 net.go:770] primary dev: ETH0
I0319 22:44:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:44:13.423108 543705 net.go:698] Add success.
I0319 22:44:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:44:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:44:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 22:44:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:44:14.456534 543705 disk_worker.go:494] system disk:vda1
I0319 22:44:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:44:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:44:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:44:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:44:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:44:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:44:21.361673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:44:21.364152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:44:21.364158 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba300 0xc0003ba340]
E0319 22:44:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:23.409793 543705 memory.go:184] no items to output this cycle
I0319 22:44:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:44:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:33.409780 543705 cpu.go:275] no items to output this cycle
I0319 22:44:33.409783 543705 memory.go:184] no items to output this cycle
E0319 22:44:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:43.409791 543705 memory.go:191] Add success.
I0319 22:44:43.409794 543705 cpu.go:282] Add success.
I0319 22:44:43.420424 543705 net.go:648] Add success.
I0319 22:44:43.423241 543705 net.go:770] primary dev: ETH0
I0319 22:44:43.423254 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:44:43.423267 543705 net.go:698] Add success.
I0319 22:44:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:44:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:44:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:44:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:44:53.409765 543705 memory.go:184] no items to output this cycle
I0319 22:44:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:45:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:03.409801 543705 memory.go:184] no items to output this cycle
I0319 22:45:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 22:45:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:13.409786 543705 memory.go:191] Add success.
I0319 22:45:13.409801 543705 cpu.go:282] Add success.
W0319 22:45:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:45:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:45:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:45:13.420139 543705 net.go:648] Add success.
I0319 22:45:13.422627 543705 net.go:770] primary dev: ETH0
I0319 22:45:13.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:45:13.422657 543705 net.go:698] Add success.
I0319 22:45:13.464112 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bd3567da-31e7-438a-bce6-fb4335468cd5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:45:13.464144 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:45:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:45:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:45:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 22:45:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:45:14.456616 543705 disk_worker.go:494] system disk:vda1
I0319 22:45:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:45:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:45:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:45:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:45:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:45:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:45:21.365673 543705 disk_info.go:125] begin check local disk info of client
I0319 22:45:21.368094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:45:21.368100 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328080 0xc0003280c0]
E0319 22:45:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:23.409788 543705 memory.go:184] no items to output this cycle
I0319 22:45:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:45:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:33.409789 543705 memory.go:184] no items to output this cycle
I0319 22:45:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 22:45:37.930208 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:45:37.930216 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:45:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:43.410728 543705 memory.go:191] Add success.
I0319 22:45:43.409811 543705 cpu.go:282] Add success.
I0319 22:45:43.420440 543705 net.go:648] Add success.
I0319 22:45:43.423129 543705 net.go:770] primary dev: ETH0
I0319 22:45:43.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:45:43.423154 543705 net.go:698] Add success.
I0319 22:45:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:45:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:45:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:45:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:45:53.409768 543705 memory.go:184] no items to output this cycle
I0319 22:45:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 22:46:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:03.409784 543705 memory.go:184] no items to output this cycle
I0319 22:46:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 22:46:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:13.409795 543705 memory.go:191] Add success.
I0319 22:46:13.409798 543705 cpu.go:282] Add success.
W0319 22:46:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:46:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:46:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:46:13.420056 543705 net.go:648] Add success.
I0319 22:46:13.422702 543705 net.go:770] primary dev: ETH0
I0319 22:46:13.422718 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:46:13.422732 543705 net.go:698] Add success.
I0319 22:46:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:46:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:46:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 22:46:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:46:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 22:46:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:46:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:46:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:46:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:46:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:46:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:46:21.369672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:46:21.372074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:46:21.372081 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024c740 0xc00024c780]
E0319 22:46:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:23.409792 543705 memory.go:184] no items to output this cycle
I0319 22:46:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:46:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:33.409775 543705 memory.go:184] no items to output this cycle
I0319 22:46:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:43.409794 543705 memory.go:191] Add success.
I0319 22:46:43.409809 543705 cpu.go:282] Add success.
I0319 22:46:43.419956 543705 net.go:648] Add success.
I0319 22:46:43.422679 543705 net.go:770] primary dev: ETH0
I0319 22:46:43.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:46:43.422708 543705 net.go:698] Add success.
I0319 22:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:46:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:46:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:46:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:46:53.409763 543705 memory.go:184] no items to output this cycle
I0319 22:46:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:47:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:03.409812 543705 memory.go:184] no items to output this cycle
I0319 22:47:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 22:47:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:13.409778 543705 memory.go:191] Add success.
I0319 22:47:13.409804 543705 cpu.go:282] Add success.
W0319 22:47:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:47:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:47:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:47:13.420278 543705 net.go:648] Add success.
I0319 22:47:13.422987 543705 net.go:770] primary dev: ETH0
I0319 22:47:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:47:13.423025 543705 net.go:698] Add success.
I0319 22:47:13.453590 543705 event_worker.go:152] Polling the log file for events...
W0319 22:47:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:47:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 22:47:14.455190 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:47:14.455903 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:47:14.455912 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:47:14.455918 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:47:14.456562 543705 disk_worker.go:494] system disk:vda1
I0319 22:47:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:47:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:47:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:47:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:47:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:47:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:47:16.458028 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:47:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:47:21.373671 543705 disk_info.go:125] begin check local disk info of client
I0319 22:47:21.376148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:47:21.376154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ac100 0xc0003ac140]
E0319 22:47:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:23.409761 543705 memory.go:184] no items to output this cycle
I0319 22:47:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 22:47:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:33.409771 543705 memory.go:184] no items to output this cycle
I0319 22:47:33.409790 543705 cpu.go:275] no items to output this cycle
E0319 22:47:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:43.409822 543705 memory.go:191] Add success.
I0319 22:47:43.409824 543705 cpu.go:282] Add success.
I0319 22:47:43.419955 543705 net.go:648] Add success.
I0319 22:47:43.422790 543705 net.go:770] primary dev: ETH0
I0319 22:47:43.422805 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:47:43.422818 543705 net.go:698] Add success.
I0319 22:47:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:47:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:47:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:47:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:47:53.409765 543705 memory.go:184] no items to output this cycle
I0319 22:47:53.409785 543705 cpu.go:275] no items to output this cycle
E0319 22:48:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:03.409791 543705 memory.go:184] no items to output this cycle
I0319 22:48:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:48:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:13.409817 543705 memory.go:191] Add success.
I0319 22:48:13.409821 543705 cpu.go:282] Add success.
W0319 22:48:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:48:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:48:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:48:13.420157 543705 net.go:648] Add success.
I0319 22:48:13.423104 543705 net.go:770] primary dev: ETH0
I0319 22:48:13.423119 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:48:13.423133 543705 net.go:698] Add success.
I0319 22:48:13.469765 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"246b2a4e-e308-48cc-b25c-716bef04f25e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:48:13.469801 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:48:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:48:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:48:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 22:48:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:48:14.456580 543705 disk_worker.go:494] system disk:vda1
I0319 22:48:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:48:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:48:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:48:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:48:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:48:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:48:21.377672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:48:21.380123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:48:21.380131 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484380 0xc0004843c0]
E0319 22:48:23.409855 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:23.409878 543705 memory.go:184] no items to output this cycle
I0319 22:48:23.410028 543705 cpu.go:275] no items to output this cycle
E0319 22:48:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:33.409778 543705 memory.go:184] no items to output this cycle
I0319 22:48:33.409794 543705 cpu.go:275] no items to output this cycle
I0319 22:48:37.932364 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:48:37.932371 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:48:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:43.410946 543705 memory.go:191] Add success.
I0319 22:48:43.409824 543705 cpu.go:282] Add success.
I0319 22:48:43.419703 543705 net.go:648] Add success.
I0319 22:48:43.422458 543705 net.go:770] primary dev: ETH0
I0319 22:48:43.422471 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:48:43.422484 543705 net.go:698] Add success.
I0319 22:48:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:48:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:48:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:48:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:48:53.409779 543705 memory.go:184] no items to output this cycle
I0319 22:48:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 22:49:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:03.409808 543705 memory.go:184] no items to output this cycle
I0319 22:49:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:49:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:13.409780 543705 memory.go:191] Add success.
I0319 22:49:13.409801 543705 cpu.go:282] Add success.
W0319 22:49:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:49:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:49:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:49:13.420052 543705 net.go:648] Add success.
I0319 22:49:13.422864 543705 net.go:770] primary dev: ETH0
I0319 22:49:13.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:49:13.422894 543705 net.go:698] Add success.
I0319 22:49:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:49:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:49:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0319 22:49:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:49:14.456573 543705 disk_worker.go:494] system disk:vda1
I0319 22:49:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:49:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:49:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:49:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:49:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:49:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:49:21.381675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:49:21.384075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:49:21.384082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5600 0xc0002a5640]
E0319 22:49:23.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:23.409885 543705 memory.go:184] no items to output this cycle
I0319 22:49:23.409929 543705 cpu.go:275] no items to output this cycle
E0319 22:49:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:33.409799 543705 memory.go:184] no items to output this cycle
I0319 22:49:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 22:49:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:43.409799 543705 memory.go:191] Add success.
I0319 22:49:43.409803 543705 cpu.go:282] Add success.
I0319 22:49:43.419975 543705 net.go:648] Add success.
I0319 22:49:43.422668 543705 net.go:770] primary dev: ETH0
I0319 22:49:43.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:49:43.422693 543705 net.go:698] Add success.
I0319 22:49:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:49:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:49:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:49:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:49:53.409776 543705 memory.go:184] no items to output this cycle
I0319 22:49:53.409776 543705 cpu.go:275] no items to output this cycle
E0319 22:50:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:03.409810 543705 memory.go:184] no items to output this cycle
I0319 22:50:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:50:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:13.409794 543705 memory.go:191] Add success.
I0319 22:50:13.409813 543705 cpu.go:282] Add success.
W0319 22:50:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:50:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:50:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:50:13.420106 543705 net.go:648] Add success.
I0319 22:50:13.422557 543705 net.go:770] primary dev: ETH0
I0319 22:50:13.422570 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:50:13.422582 543705 net.go:698] Add success.
I0319 22:50:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:50:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:50:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 22:50:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:50:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 22:50:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:50:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:50:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:50:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:50:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:50:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:50:21.385672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:50:21.388101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:50:21.388107 543705 disk_info.go:196] parse disk info done, disk is : [0xc000369980 0xc0003699c0]
E0319 22:50:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:23.409800 543705 memory.go:184] no items to output this cycle
I0319 22:50:23.409815 543705 cpu.go:275] no items to output this cycle
E0319 22:50:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:33.409821 543705 memory.go:184] no items to output this cycle
I0319 22:50:33.409832 543705 cpu.go:275] no items to output this cycle
E0319 22:50:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:43.409833 543705 memory.go:191] Add success.
I0319 22:50:43.409835 543705 cpu.go:282] Add success.
I0319 22:50:43.420027 543705 net.go:648] Add success.
I0319 22:50:43.422684 543705 net.go:770] primary dev: ETH0
I0319 22:50:43.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:50:43.422713 543705 net.go:698] Add success.
I0319 22:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:50:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:50:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:50:53.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:50:53.409826 543705 memory.go:184] no items to output this cycle
I0319 22:50:53.409834 543705 cpu.go:275] no items to output this cycle
E0319 22:51:03.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:03.409822 543705 memory.go:184] no items to output this cycle
I0319 22:51:03.409835 543705 cpu.go:275] no items to output this cycle
E0319 22:51:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:13.409791 543705 memory.go:191] Add success.
I0319 22:51:13.409811 543705 cpu.go:282] Add success.
W0319 22:51:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:51:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:51:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:51:13.420052 543705 net.go:648] Add success.
I0319 22:51:13.422953 543705 net.go:770] primary dev: ETH0
I0319 22:51:13.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:51:13.422977 543705 net.go:698] Add success.
I0319 22:51:13.468504 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4bdabf63-2457-4008-8c07-2cc69cc318a3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:51:13.468536 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:51:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:51:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:51:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 22:51:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:51:14.456537 543705 disk_worker.go:494] system disk:vda1
I0319 22:51:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:51:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:51:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:51:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:51:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:51:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:51:21.391996 543705 disk_info.go:125] begin check local disk info of client
I0319 22:51:21.394482 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:51:21.394488 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039f2c0 0xc00039f300]
E0319 22:51:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:23.409761 543705 memory.go:184] no items to output this cycle
I0319 22:51:23.409879 543705 cpu.go:275] no items to output this cycle
E0319 22:51:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:33.409806 543705 memory.go:184] no items to output this cycle
I0319 22:51:33.409826 543705 cpu.go:275] no items to output this cycle
I0319 22:51:37.933734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:51:37.933742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:51:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:43.410689 543705 memory.go:191] Add success.
I0319 22:51:43.409803 543705 cpu.go:282] Add success.
I0319 22:51:43.420430 543705 net.go:648] Add success.
I0319 22:51:43.423285 543705 net.go:770] primary dev: ETH0
I0319 22:51:43.423300 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:51:43.423315 543705 net.go:698] Add success.
I0319 22:51:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:51:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:51:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:51:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:51:53.409769 543705 memory.go:184] no items to output this cycle
I0319 22:51:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:52:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:03.409780 543705 memory.go:184] no items to output this cycle
I0319 22:52:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 22:52:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:13.409801 543705 memory.go:191] Add success.
I0319 22:52:13.409801 543705 cpu.go:282] Add success.
W0319 22:52:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:52:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:52:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:52:13.420075 543705 net.go:648] Add success.
I0319 22:52:13.423205 543705 net.go:770] primary dev: ETH0
I0319 22:52:13.423219 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:52:13.423232 543705 net.go:698] Add success.
W0319 22:52:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:52:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0319 22:52:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:52:14.455917 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:52:14.455925 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:52:14.455932 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:52:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 22:52:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:52:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:52:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:52:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:52:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:52:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:52:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:52:16.472433 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:52:21.397675 543705 disk_info.go:125] begin check local disk info of client
I0319 22:52:21.400090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:52:21.400095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000343140 0xc000343180]
E0319 22:52:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:23.409791 543705 memory.go:184] no items to output this cycle
I0319 22:52:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 22:52:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:33.409805 543705 memory.go:184] no items to output this cycle
I0319 22:52:33.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:52:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:43.409807 543705 memory.go:191] Add success.
I0319 22:52:43.409809 543705 cpu.go:282] Add success.
I0319 22:52:43.420035 543705 net.go:648] Add success.
I0319 22:52:43.422992 543705 net.go:770] primary dev: ETH0
I0319 22:52:43.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:52:43.423022 543705 net.go:698] Add success.
I0319 22:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:52:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:52:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:52:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:52:53.409778 543705 memory.go:184] no items to output this cycle
I0319 22:52:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 22:53:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:03.409808 543705 memory.go:184] no items to output this cycle
I0319 22:53:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 22:53:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:13.409792 543705 memory.go:191] Add success.
I0319 22:53:13.409795 543705 cpu.go:282] Add success.
W0319 22:53:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:53:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:53:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:53:13.420177 543705 net.go:648] Add success.
I0319 22:53:13.422831 543705 net.go:770] primary dev: ETH0
I0319 22:53:13.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:53:13.422860 543705 net.go:698] Add success.
I0319 22:53:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:53:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:53:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0319 22:53:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:53:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 22:53:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:53:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:53:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:53:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:53:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:53:21.401677 543705 disk_info.go:125] begin check local disk info of client
I0319 22:53:21.404084 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:53:21.404091 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034f580 0xc00034f5c0]
E0319 22:53:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:53:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 22:53:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:33.409872 543705 cpu.go:275] no items to output this cycle
I0319 22:53:33.409898 543705 memory.go:184] no items to output this cycle
E0319 22:53:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:43.409803 543705 memory.go:191] Add success.
I0319 22:53:43.409806 543705 cpu.go:282] Add success.
I0319 22:53:43.419912 543705 net.go:648] Add success.
I0319 22:53:43.422564 543705 net.go:770] primary dev: ETH0
I0319 22:53:43.422578 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:53:43.422590 543705 net.go:698] Add success.
I0319 22:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:53:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:53:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:53:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:53:53.409768 543705 memory.go:184] no items to output this cycle
I0319 22:53:53.409794 543705 cpu.go:275] no items to output this cycle
E0319 22:54:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:03.409804 543705 memory.go:184] no items to output this cycle
I0319 22:54:03.409819 543705 cpu.go:275] no items to output this cycle
E0319 22:54:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:13.409788 543705 memory.go:191] Add success.
W0319 22:54:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:54:13.409817 543705 cpu.go:282] Add success.
W0319 22:54:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:54:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:54:13.420492 543705 net.go:648] Add success.
I0319 22:54:13.423050 543705 net.go:770] primary dev: ETH0
I0319 22:54:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:54:13.423075 543705 net.go:698] Add success.
I0319 22:54:13.510968 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0b59467e-4d97-4016-8e43-288a50aee76e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:54:13.511002 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 22:54:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:54:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:54:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 22:54:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:54:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 22:54:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:54:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:54:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:54:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:54:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:54:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:54:21.405672 543705 disk_info.go:125] begin check local disk info of client
I0319 22:54:21.408075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:54:21.408081 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027ab00 0xc00027ab40]
E0319 22:54:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:54:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:54:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:33.409778 543705 memory.go:184] no items to output this cycle
I0319 22:54:33.409780 543705 cpu.go:275] no items to output this cycle
I0319 22:54:37.936386 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:54:37.936394 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:54:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:43.410658 543705 memory.go:191] Add success.
I0319 22:54:43.409825 543705 cpu.go:282] Add success.
I0319 22:54:43.420406 543705 net.go:648] Add success.
I0319 22:54:43.422965 543705 net.go:770] primary dev: ETH0
I0319 22:54:43.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:54:43.422990 543705 net.go:698] Add success.
I0319 22:54:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:54:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:54:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:54:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:54:53.409784 543705 cpu.go:275] no items to output this cycle
I0319 22:54:53.409787 543705 memory.go:184] no items to output this cycle
E0319 22:55:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:03.409806 543705 memory.go:184] no items to output this cycle
I0319 22:55:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 22:55:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:13.409778 543705 memory.go:191] Add success.
W0319 22:55:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 22:55:13.409809 543705 cpu.go:282] Add success.
W0319 22:55:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:55:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:55:13.420133 543705 net.go:648] Add success.
I0319 22:55:13.422827 543705 net.go:770] primary dev: ETH0
I0319 22:55:13.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:55:13.422855 543705 net.go:698] Add success.
I0319 22:55:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:55:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:55:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0319 22:55:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:55:14.456548 543705 disk_worker.go:494] system disk:vda1
I0319 22:55:14.456578 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:55:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:55:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:55:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:55:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:55:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:55:21.409678 543705 disk_info.go:125] begin check local disk info of client
I0319 22:55:21.412081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:55:21.412087 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002092c0 0xc000209300]
E0319 22:55:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:23.409789 543705 memory.go:184] no items to output this cycle
I0319 22:55:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:55:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 22:55:33.409792 543705 memory.go:184] no items to output this cycle
E0319 22:55:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:43.409824 543705 memory.go:191] Add success.
I0319 22:55:43.409831 543705 cpu.go:282] Add success.
I0319 22:55:43.420217 543705 net.go:648] Add success.
I0319 22:55:43.423332 543705 net.go:770] primary dev: ETH0
I0319 22:55:43.423347 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:55:43.423361 543705 net.go:698] Add success.
I0319 22:55:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:55:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:55:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:55:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:55:53.409792 543705 memory.go:184] no items to output this cycle
I0319 22:55:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 22:56:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:03.409792 543705 memory.go:184] no items to output this cycle
I0319 22:56:03.409795 543705 cpu.go:275] no items to output this cycle
E0319 22:56:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:13.409820 543705 memory.go:191] Add success.
I0319 22:56:13.409832 543705 cpu.go:282] Add success.
W0319 22:56:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:56:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:56:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:56:13.420393 543705 net.go:648] Add success.
I0319 22:56:13.423172 543705 net.go:770] primary dev: ETH0
I0319 22:56:13.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:56:13.423196 543705 net.go:698] Add success.
I0319 22:56:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:56:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:56:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0319 22:56:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:56:14.456579 543705 disk_worker.go:494] system disk:vda1
I0319 22:56:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:56:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:56:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:56:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:56:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:56:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:56:21.412798 543705 disk_info.go:125] begin check local disk info of client
I0319 22:56:21.415234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:56:21.415240 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e140 0xc00039e180]
E0319 22:56:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:23.409793 543705 memory.go:184] no items to output this cycle
I0319 22:56:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 22:56:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:33.409797 543705 memory.go:184] no items to output this cycle
I0319 22:56:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 22:56:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:43.409923 543705 memory.go:191] Add success.
I0319 22:56:43.409924 543705 cpu.go:282] Add success.
I0319 22:56:43.419781 543705 net.go:648] Add success.
I0319 22:56:43.422734 543705 net.go:770] primary dev: ETH0
I0319 22:56:43.422748 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:56:43.422762 543705 net.go:698] Add success.
I0319 22:56:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:56:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:56:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:56:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:56:53.409783 543705 memory.go:184] no items to output this cycle
I0319 22:56:53.409787 543705 cpu.go:275] no items to output this cycle
E0319 22:57:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:03.409808 543705 memory.go:184] no items to output this cycle
I0319 22:57:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 22:57:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:13.409787 543705 memory.go:191] Add success.
I0319 22:57:13.409792 543705 cpu.go:282] Add success.
W0319 22:57:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:57:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:57:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:57:13.420103 543705 net.go:648] Add success.
I0319 22:57:13.423202 543705 net.go:770] primary dev: ETH0
I0319 22:57:13.423215 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:57:13.423227 543705 net.go:698] Add success.
I0319 22:57:13.429816 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 22:57:13.452983 543705 event_worker.go:152] Polling the log file for events...
I0319 22:57:13.468069 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f4b30bc4-b04b-4379-a27a-1e9b6876f751","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 22:57:13.468105 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 22:57:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:57:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 22:57:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0319 22:57:14.456800 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 22:57:14.456808 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 22:57:14.456814 543705 custom_config.go:64] query custom config with name: gpu
I0319 22:57:14.456842 543705 disk_worker.go:494] system disk:vda1
I0319 22:57:14.456868 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 22:57:15.456847 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 22:57:15.456856 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:57:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 22:57:16.457970 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 22:57:16.458014 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:57:16.458031 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:57:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:57:21.415795 543705 disk_info.go:125] begin check local disk info of client
I0319 22:57:21.418285 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:57:21.418291 543705 disk_info.go:196] parse disk info done, disk is : [0xc000265300 0xc000265340]
E0319 22:57:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:23.409877 543705 cpu.go:275] no items to output this cycle
I0319 22:57:23.409888 543705 memory.go:184] no items to output this cycle
E0319 22:57:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:33.409780 543705 memory.go:184] no items to output this cycle
I0319 22:57:33.409803 543705 cpu.go:275] no items to output this cycle
I0319 22:57:37.937748 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 22:57:37.937756 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 22:57:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:43.410592 543705 memory.go:191] Add success.
I0319 22:57:43.409831 543705 cpu.go:282] Add success.
I0319 22:57:43.420351 543705 net.go:648] Add success.
I0319 22:57:43.423054 543705 net.go:770] primary dev: ETH0
I0319 22:57:43.423067 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:57:43.423080 543705 net.go:698] Add success.
I0319 22:57:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:57:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:57:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:57:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:57:53.409793 543705 memory.go:184] no items to output this cycle
I0319 22:57:53.409806 543705 cpu.go:275] no items to output this cycle
E0319 22:58:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:03.409782 543705 memory.go:184] no items to output this cycle
I0319 22:58:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 22:58:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:13.409798 543705 cpu.go:282] Add success.
I0319 22:58:13.409806 543705 memory.go:191] Add success.
W0319 22:58:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:58:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:58:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:58:13.420076 543705 net.go:648] Add success.
I0319 22:58:13.422924 543705 net.go:770] primary dev: ETH0
I0319 22:58:13.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:58:13.422953 543705 net.go:698] Add success.
I0319 22:58:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:58:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:58:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 22:58:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:58:14.456584 543705 disk_worker.go:494] system disk:vda1
I0319 22:58:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:58:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:58:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:58:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:58:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:58:16.472469 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:58:21.418804 543705 disk_info.go:125] begin check local disk info of client
I0319 22:58:21.421270 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:58:21.421276 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a2340 0xc0002a2380]
E0319 22:58:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:23.409775 543705 memory.go:184] no items to output this cycle
I0319 22:58:23.409778 543705 cpu.go:275] no items to output this cycle
E0319 22:58:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:33.409773 543705 memory.go:184] no items to output this cycle
I0319 22:58:33.409791 543705 cpu.go:275] no items to output this cycle
E0319 22:58:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:43.409802 543705 memory.go:191] Add success.
I0319 22:58:43.409805 543705 cpu.go:282] Add success.
I0319 22:58:43.419879 543705 net.go:648] Add success.
I0319 22:58:43.422680 543705 net.go:770] primary dev: ETH0
I0319 22:58:43.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:58:43.422707 543705 net.go:698] Add success.
I0319 22:58:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:58:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:58:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:58:53.410377 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:58:53.410399 543705 memory.go:184] no items to output this cycle
I0319 22:58:53.410411 543705 cpu.go:275] no items to output this cycle
E0319 22:59:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:03.409809 543705 memory.go:184] no items to output this cycle
I0319 22:59:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 22:59:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:13.409781 543705 memory.go:191] Add success.
I0319 22:59:13.409800 543705 cpu.go:282] Add success.
W0319 22:59:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 22:59:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 22:59:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 22:59:13.420113 543705 net.go:648] Add success.
I0319 22:59:13.423276 543705 net.go:770] primary dev: ETH0
I0319 22:59:13.423291 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:59:13.423302 543705 net.go:698] Add success.
I0319 22:59:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0319 22:59:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 22:59:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0319 22:59:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0319 22:59:14.456486 543705 disk_worker.go:494] system disk:vda1
I0319 22:59:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 22:59:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 22:59:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:59:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:59:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 22:59:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0319 22:59:21.421809 543705 disk_info.go:125] begin check local disk info of client
I0319 22:59:21.424234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 22:59:21.424241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b0240 0xc0003b0280]
E0319 22:59:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:23.409764 543705 memory.go:184] no items to output this cycle
I0319 22:59:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 22:59:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:33.409807 543705 memory.go:184] no items to output this cycle
I0319 22:59:33.409821 543705 cpu.go:275] no items to output this cycle
E0319 22:59:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:43.409789 543705 memory.go:191] Add success.
I0319 22:59:43.409815 543705 cpu.go:282] Add success.
I0319 22:59:43.420414 543705 net.go:648] Add success.
I0319 22:59:43.423557 543705 net.go:770] primary dev: ETH0
I0319 22:59:43.423571 543705 net.go:802] Send network stats successfully!,count is 6
I0319 22:59:43.423584 543705 net.go:698] Add success.
I0319 22:59:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 22:59:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 22:59:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 22:59:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 22:59:53.409793 543705 memory.go:184] no items to output this cycle
I0319 22:59:53.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:00:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:03.409782 543705 memory.go:184] no items to output this cycle
I0319 23:00:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 23:00:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:13.409791 543705 memory.go:191] Add success.
I0319 23:00:13.409794 543705 cpu.go:282] Add success.
W0319 23:00:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:00:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:00:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:00:13.420079 543705 net.go:648] Add success.
I0319 23:00:13.422774 543705 net.go:770] primary dev: ETH0
I0319 23:00:13.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:00:13.422801 543705 net.go:698] Add success.
I0319 23:00:13.468952 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cbd76e31-bcf3-42bf-8479-5018f0e6554d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:00:13.468993 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:00:14.454990 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:00:14.455386 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:00:14.455396 543705 disk_worker.go:708] disk space is not compliant
W0319 23:00:14.455399 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:00:14.457547 543705 disk_worker.go:494] system disk:vda1
I0319 23:00:14.457575 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:00:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:00:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:00:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:00:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:00:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:00:21.424811 543705 disk_info.go:125] begin check local disk info of client
I0319 23:00:21.427209 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:00:21.427215 543705 disk_info.go:196] parse disk info done, disk is : [0xc000365340 0xc000365380]
E0319 23:00:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:23.409791 543705 memory.go:184] no items to output this cycle
I0319 23:00:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:00:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:33.409776 543705 memory.go:184] no items to output this cycle
I0319 23:00:33.409777 543705 cpu.go:275] no items to output this cycle
I0319 23:00:37.938019 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:00:37.938027 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:00:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:43.410596 543705 memory.go:191] Add success.
I0319 23:00:43.409809 543705 cpu.go:282] Add success.
I0319 23:00:43.420290 543705 net.go:648] Add success.
I0319 23:00:43.422635 543705 net.go:770] primary dev: ETH0
I0319 23:00:43.422650 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:00:43.422664 543705 net.go:698] Add success.
I0319 23:00:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:00:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:00:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:00:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:00:53.409783 543705 memory.go:184] no items to output this cycle
I0319 23:00:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 23:01:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:03.409813 543705 memory.go:184] no items to output this cycle
I0319 23:01:03.409818 543705 cpu.go:275] no items to output this cycle
E0319 23:01:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:13.409831 543705 memory.go:191] Add success.
I0319 23:01:13.409834 543705 cpu.go:282] Add success.
W0319 23:01:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:01:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:01:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:01:13.420325 543705 net.go:648] Add success.
I0319 23:01:13.422999 543705 net.go:770] primary dev: ETH0
I0319 23:01:13.423012 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:01:13.423024 543705 net.go:698] Add success.
I0319 23:01:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:01:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:01:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 23:01:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:01:14.456566 543705 disk_worker.go:494] system disk:vda1
I0319 23:01:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:01:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:01:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:01:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:01:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:01:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:01:21.427824 543705 disk_info.go:125] begin check local disk info of client
I0319 23:01:21.430271 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:01:21.430277 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364600 0xc000364640]
E0319 23:01:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:23.409798 543705 memory.go:184] no items to output this cycle
I0319 23:01:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:01:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:33.409770 543705 memory.go:184] no items to output this cycle
I0319 23:01:33.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:01:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:43.409825 543705 memory.go:191] Add success.
I0319 23:01:43.409829 543705 cpu.go:282] Add success.
I0319 23:01:43.420099 543705 net.go:648] Add success.
I0319 23:01:43.423276 543705 net.go:770] primary dev: ETH0
I0319 23:01:43.423289 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:01:43.423301 543705 net.go:698] Add success.
I0319 23:01:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:01:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:01:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:01:53.409801 543705 memory.go:184] no items to output this cycle
I0319 23:01:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 23:02:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:03.409771 543705 memory.go:184] no items to output this cycle
I0319 23:02:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 23:02:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:13.409796 543705 memory.go:191] Add success.
I0319 23:02:13.409799 543705 cpu.go:282] Add success.
W0319 23:02:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:02:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:02:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:02:13.419739 543705 net.go:648] Add success.
I0319 23:02:13.422460 543705 net.go:770] primary dev: ETH0
I0319 23:02:13.422475 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:02:13.422488 543705 net.go:698] Add success.
W0319 23:02:14.455089 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:02:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0319 23:02:14.455148 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:02:14.456895 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:02:14.456904 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:02:14.456910 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:02:14.456981 543705 disk_worker.go:494] system disk:vda1
I0319 23:02:14.457021 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:02:15.456829 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:02:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
E0319 23:02:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:02:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:02:16.458019 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:02:16.458038 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:02:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:02:21.430844 543705 disk_info.go:125] begin check local disk info of client
I0319 23:02:21.433288 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:02:21.433295 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027a600 0xc00027a640]
E0319 23:02:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:23.409757 543705 memory.go:184] no items to output this cycle
I0319 23:02:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 23:02:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:33.409795 543705 memory.go:184] no items to output this cycle
I0319 23:02:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:02:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:43.409823 543705 memory.go:191] Add success.
I0319 23:02:43.409826 543705 cpu.go:282] Add success.
I0319 23:02:43.420005 543705 net.go:648] Add success.
I0319 23:02:43.422700 543705 net.go:770] primary dev: ETH0
I0319 23:02:43.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:02:43.422725 543705 net.go:698] Add success.
I0319 23:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:02:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:02:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:02:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:02:53.409785 543705 memory.go:184] no items to output this cycle
I0319 23:02:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 23:03:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:03.409806 543705 memory.go:184] no items to output this cycle
I0319 23:03:03.409822 543705 cpu.go:275] no items to output this cycle
E0319 23:03:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:13.409774 543705 memory.go:191] Add success.
W0319 23:03:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:03:13.409807 543705 cpu.go:282] Add success.
W0319 23:03:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:03:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:03:13.419726 543705 net.go:648] Add success.
I0319 23:03:13.422501 543705 net.go:770] primary dev: ETH0
I0319 23:03:13.422513 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:03:13.422524 543705 net.go:698] Add success.
I0319 23:03:13.468668 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"765c46a8-3699-4c1e-a89d-3013c6db3250","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:03:13.468698 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:03:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:03:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:03:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 23:03:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:03:14.456689 543705 disk_worker.go:494] system disk:vda1
I0319 23:03:14.456722 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:03:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:03:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:03:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:03:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:03:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:03:21.433868 543705 disk_info.go:125] begin check local disk info of client
I0319 23:03:21.436476 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:03:21.436482 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bf900 0xc0004bf940]
E0319 23:03:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:23.409790 543705 memory.go:184] no items to output this cycle
I0319 23:03:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:03:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:33.409772 543705 memory.go:184] no items to output this cycle
I0319 23:03:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 23:03:37.940415 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:03:37.940423 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:03:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:43.410660 543705 memory.go:191] Add success.
I0319 23:03:43.409805 543705 cpu.go:282] Add success.
I0319 23:03:43.420388 543705 net.go:648] Add success.
I0319 23:03:43.423146 543705 net.go:770] primary dev: ETH0
I0319 23:03:43.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:03:43.423177 543705 net.go:698] Add success.
I0319 23:03:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:03:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:03:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:03:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:03:53.409773 543705 memory.go:184] no items to output this cycle
I0319 23:03:53.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:04:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:03.409806 543705 memory.go:184] no items to output this cycle
I0319 23:04:03.409815 543705 cpu.go:275] no items to output this cycle
E0319 23:04:13.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:13.409928 543705 memory.go:191] Add success.
I0319 23:04:13.410012 543705 cpu.go:282] Add success.
W0319 23:04:13.409956 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:04:13.410160 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:04:13.410171 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:04:13.419714 543705 net.go:648] Add success.
I0319 23:04:13.422333 543705 net.go:770] primary dev: ETH0
I0319 23:04:13.422349 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:04:13.422362 543705 net.go:698] Add success.
I0319 23:04:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:04:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:04:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 23:04:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:04:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 23:04:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:04:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:04:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:04:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:04:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:04:16.472486 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:04:21.436878 543705 disk_info.go:125] begin check local disk info of client
I0319 23:04:21.439300 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:04:21.439306 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004988c0 0xc000498900]
E0319 23:04:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:23.409788 543705 memory.go:184] no items to output this cycle
I0319 23:04:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 23:04:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:33.409784 543705 memory.go:184] no items to output this cycle
I0319 23:04:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 23:04:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:43.409800 543705 memory.go:191] Add success.
I0319 23:04:43.409799 543705 cpu.go:282] Add success.
I0319 23:04:43.419876 543705 net.go:648] Add success.
I0319 23:04:43.422523 543705 net.go:770] primary dev: ETH0
I0319 23:04:43.422539 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:04:43.422554 543705 net.go:698] Add success.
I0319 23:04:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:04:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:04:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:04:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:04:53.409780 543705 memory.go:184] no items to output this cycle
I0319 23:04:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:05:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:03.409806 543705 memory.go:184] no items to output this cycle
I0319 23:05:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 23:05:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:13.409811 543705 memory.go:191] Add success.
I0319 23:05:13.409820 543705 cpu.go:282] Add success.
W0319 23:05:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:05:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:05:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:05:13.420303 543705 net.go:648] Add success.
I0319 23:05:13.423121 543705 net.go:770] primary dev: ETH0
I0319 23:05:13.423136 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:05:13.423150 543705 net.go:698] Add success.
I0319 23:05:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:05:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:05:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 23:05:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:05:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 23:05:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:05:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:05:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:05:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:05:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:05:16.472448 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:05:21.439892 543705 disk_info.go:125] begin check local disk info of client
I0319 23:05:21.442320 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:05:21.442326 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c41c0 0xc0000c4200]
E0319 23:05:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:23.409789 543705 memory.go:184] no items to output this cycle
I0319 23:05:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:05:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:33.409799 543705 memory.go:184] no items to output this cycle
I0319 23:05:33.409813 543705 cpu.go:275] no items to output this cycle
E0319 23:05:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:43.409823 543705 memory.go:191] Add success.
I0319 23:05:43.409827 543705 cpu.go:282] Add success.
I0319 23:05:43.419978 543705 net.go:648] Add success.
I0319 23:05:43.422501 543705 net.go:770] primary dev: ETH0
I0319 23:05:43.422514 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:05:43.422526 543705 net.go:698] Add success.
I0319 23:05:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:05:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:05:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:05:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:05:53.409768 543705 memory.go:184] no items to output this cycle
I0319 23:05:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 23:06:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:03.409775 543705 memory.go:184] no items to output this cycle
I0319 23:06:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:06:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:13.409823 543705 memory.go:191] Add success.
I0319 23:06:13.409832 543705 cpu.go:282] Add success.
W0319 23:06:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:06:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:06:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:06:13.420141 543705 net.go:648] Add success.
I0319 23:06:13.422791 543705 net.go:770] primary dev: ETH0
I0319 23:06:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:06:13.422815 543705 net.go:698] Add success.
I0319 23:06:13.534743 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9711a74f-25b2-4be5-991d-0a9f1b37960f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:06:13.534776 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:06:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:06:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0319 23:06:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:06:14.456732 543705 disk_worker.go:494] system disk:vda1
I0319 23:06:14.456761 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:06:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:06:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:06:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:06:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:06:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:06:21.442907 543705 disk_info.go:125] begin check local disk info of client
I0319 23:06:21.445344 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:06:21.445350 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ebdc0 0xc0004ebe00]
E0319 23:06:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:23.409787 543705 memory.go:184] no items to output this cycle
I0319 23:06:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:06:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:33.409796 543705 memory.go:184] no items to output this cycle
I0319 23:06:33.409808 543705 cpu.go:275] no items to output this cycle
I0319 23:06:37.941759 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:06:37.941768 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:06:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:43.410792 543705 memory.go:191] Add success.
I0319 23:06:43.409805 543705 cpu.go:282] Add success.
I0319 23:06:43.420569 543705 net.go:648] Add success.
I0319 23:06:43.423613 543705 net.go:770] primary dev: ETH0
I0319 23:06:43.423627 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:06:43.423654 543705 net.go:698] Add success.
I0319 23:06:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:06:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:06:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:06:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:06:53.409791 543705 memory.go:184] no items to output this cycle
I0319 23:06:53.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:07:03.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:03.409922 543705 memory.go:184] no items to output this cycle
I0319 23:07:03.409928 543705 cpu.go:275] no items to output this cycle
E0319 23:07:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:13.409822 543705 memory.go:191] Add success.
I0319 23:07:13.409837 543705 cpu.go:282] Add success.
W0319 23:07:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:07:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:07:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:07:13.420194 543705 net.go:648] Add success.
I0319 23:07:13.422869 543705 net.go:770] primary dev: ETH0
I0319 23:07:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:07:13.422894 543705 net.go:698] Add success.
I0319 23:07:13.453565 543705 event_worker.go:152] Polling the log file for events...
W0319 23:07:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:07:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 23:07:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:07:14.455900 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:07:14.455909 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:07:14.455914 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:07:14.456545 543705 disk_worker.go:494] system disk:vda1
I0319 23:07:14.456573 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:07:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:07:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:07:16.457899 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:07:16.457897 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:07:16.457954 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:07:16.457974 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:07:16.472353 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:07:21.445913 543705 disk_info.go:125] begin check local disk info of client
I0319 23:07:21.448322 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:07:21.448329 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ebbc0 0xc0004ebc00]
E0319 23:07:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:23.409796 543705 memory.go:184] no items to output this cycle
I0319 23:07:23.409811 543705 cpu.go:275] no items to output this cycle
E0319 23:07:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:33.409814 543705 memory.go:184] no items to output this cycle
I0319 23:07:33.409827 543705 cpu.go:275] no items to output this cycle
E0319 23:07:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:43.409803 543705 memory.go:191] Add success.
I0319 23:07:43.409822 543705 cpu.go:282] Add success.
I0319 23:07:43.420008 543705 net.go:648] Add success.
I0319 23:07:43.422684 543705 net.go:770] primary dev: ETH0
I0319 23:07:43.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:07:43.422711 543705 net.go:698] Add success.
I0319 23:07:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:07:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:07:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:07:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:07:53.409787 543705 memory.go:184] no items to output this cycle
I0319 23:07:53.409816 543705 cpu.go:275] no items to output this cycle
E0319 23:08:03.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:03.409818 543705 memory.go:184] no items to output this cycle
I0319 23:08:03.409832 543705 cpu.go:275] no items to output this cycle
E0319 23:08:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:13.409789 543705 memory.go:191] Add success.
I0319 23:08:13.409791 543705 cpu.go:282] Add success.
W0319 23:08:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:08:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:08:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:08:13.420184 543705 net.go:648] Add success.
I0319 23:08:13.423090 543705 net.go:770] primary dev: ETH0
I0319 23:08:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:08:13.423116 543705 net.go:698] Add success.
I0319 23:08:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:08:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:08:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0319 23:08:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:08:14.456484 543705 disk_worker.go:494] system disk:vda1
I0319 23:08:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:08:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:08:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:08:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:08:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:08:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:08:21.448936 543705 disk_info.go:125] begin check local disk info of client
I0319 23:08:21.451380 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:08:21.451386 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329040 0xc000329080]
E0319 23:08:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:23.409795 543705 memory.go:184] no items to output this cycle
I0319 23:08:23.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:08:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:33.409805 543705 memory.go:184] no items to output this cycle
I0319 23:08:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:08:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:43.409808 543705 memory.go:191] Add success.
I0319 23:08:43.409808 543705 cpu.go:282] Add success.
I0319 23:08:43.419963 543705 net.go:648] Add success.
I0319 23:08:43.422604 543705 net.go:770] primary dev: ETH0
I0319 23:08:43.422618 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:08:43.422630 543705 net.go:698] Add success.
I0319 23:08:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:08:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:08:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:08:53.410238 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:08:53.410257 543705 memory.go:184] no items to output this cycle
I0319 23:08:53.410262 543705 cpu.go:275] no items to output this cycle
E0319 23:09:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:03.409787 543705 memory.go:184] no items to output this cycle
I0319 23:09:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:09:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:13.409794 543705 memory.go:191] Add success.
I0319 23:09:13.409795 543705 cpu.go:282] Add success.
W0319 23:09:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:09:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:09:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:09:13.420133 543705 net.go:648] Add success.
I0319 23:09:13.422929 543705 net.go:770] primary dev: ETH0
I0319 23:09:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:09:13.422953 543705 net.go:698] Add success.
I0319 23:09:13.469429 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5be7790f-f9ae-4a03-89e1-e9172625425f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:09:13.469461 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:09:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:09:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:09:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0319 23:09:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:09:14.456666 543705 disk_worker.go:494] system disk:vda1
I0319 23:09:14.456696 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:09:15.455616 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:09:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:09:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:09:16.458045 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:09:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:09:21.451948 543705 disk_info.go:125] begin check local disk info of client
I0319 23:09:21.454392 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:09:21.454398 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034ce40 0xc00034ce80]
E0319 23:09:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:23.409772 543705 memory.go:184] no items to output this cycle
I0319 23:09:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 23:09:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:33.409767 543705 memory.go:184] no items to output this cycle
I0319 23:09:33.409797 543705 cpu.go:275] no items to output this cycle
I0319 23:09:37.944433 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:09:37.944443 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:09:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:43.410992 543705 memory.go:191] Add success.
I0319 23:09:43.409831 543705 cpu.go:282] Add success.
I0319 23:09:43.420703 543705 net.go:648] Add success.
I0319 23:09:43.423603 543705 net.go:770] primary dev: ETH0
I0319 23:09:43.423615 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:09:43.423628 543705 net.go:698] Add success.
I0319 23:09:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:09:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:09:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:09:53.410223 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:09:53.410241 543705 memory.go:184] no items to output this cycle
I0319 23:09:53.410252 543705 cpu.go:275] no items to output this cycle
E0319 23:10:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:03.409799 543705 memory.go:184] no items to output this cycle
I0319 23:10:03.409813 543705 cpu.go:275] no items to output this cycle
E0319 23:10:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:13.409795 543705 memory.go:191] Add success.
I0319 23:10:13.409796 543705 cpu.go:282] Add success.
W0319 23:10:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:10:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:10:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:10:13.420150 543705 net.go:648] Add success.
I0319 23:10:13.423035 543705 net.go:770] primary dev: ETH0
I0319 23:10:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:10:13.423065 543705 net.go:698] Add success.
I0319 23:10:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:10:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:10:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 23:10:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:10:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 23:10:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:10:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:10:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:10:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:10:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:10:16.472449 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:10:21.454965 543705 disk_info.go:125] begin check local disk info of client
I0319 23:10:21.457342 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:10:21.457348 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328900 0xc000328940]
E0319 23:10:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:23.409788 543705 memory.go:184] no items to output this cycle
I0319 23:10:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:10:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:33.409779 543705 memory.go:184] no items to output this cycle
I0319 23:10:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 23:10:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:43.409802 543705 memory.go:191] Add success.
I0319 23:10:43.409804 543705 cpu.go:282] Add success.
I0319 23:10:43.420032 543705 net.go:648] Add success.
I0319 23:10:43.423280 543705 net.go:770] primary dev: ETH0
I0319 23:10:43.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:10:43.423309 543705 net.go:698] Add success.
I0319 23:10:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:10:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:10:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:10:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:10:53.409797 543705 memory.go:184] no items to output this cycle
I0319 23:10:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 23:11:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:03.409779 543705 memory.go:184] no items to output this cycle
I0319 23:11:03.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:11:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:13.409797 543705 memory.go:191] Add success.
I0319 23:11:13.409800 543705 cpu.go:282] Add success.
W0319 23:11:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:11:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:11:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:11:13.420243 543705 net.go:648] Add success.
I0319 23:11:13.423023 543705 net.go:770] primary dev: ETH0
I0319 23:11:13.423038 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:11:13.423052 543705 net.go:698] Add success.
I0319 23:11:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:11:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:11:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 23:11:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:11:14.456570 543705 disk_worker.go:494] system disk:vda1
I0319 23:11:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:11:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:11:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:11:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:11:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:11:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:11:21.457981 543705 disk_info.go:125] begin check local disk info of client
I0319 23:11:21.460464 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:11:21.460471 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328c00 0xc000328c40]
E0319 23:11:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:23.409762 543705 memory.go:184] no items to output this cycle
I0319 23:11:23.409796 543705 cpu.go:275] no items to output this cycle
E0319 23:11:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:33.409781 543705 memory.go:184] no items to output this cycle
I0319 23:11:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 23:11:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:43.409799 543705 memory.go:191] Add success.
I0319 23:11:43.409802 543705 cpu.go:282] Add success.
I0319 23:11:43.419864 543705 net.go:648] Add success.
I0319 23:11:43.423017 543705 net.go:770] primary dev: ETH0
I0319 23:11:43.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:11:43.423043 543705 net.go:698] Add success.
I0319 23:11:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:11:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:11:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:11:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:11:53.409774 543705 memory.go:184] no items to output this cycle
I0319 23:11:53.409778 543705 cpu.go:275] no items to output this cycle
E0319 23:12:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:03.409776 543705 memory.go:184] no items to output this cycle
I0319 23:12:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:12:13.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:13.409905 543705 memory.go:191] Add success.
W0319 23:12:13.409937 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:12:13.409950 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:12:13.409956 543705 cpu.go:282] Add success.
I0319 23:12:13.409960 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:12:13.419708 543705 net.go:648] Add success.
I0319 23:12:13.422323 543705 net.go:770] primary dev: ETH0
I0319 23:12:13.422336 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:12:13.422347 543705 net.go:698] Add success.
I0319 23:12:13.463250 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"278fcee4-44ed-4d63-8740-3a62268e2d7b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:12:13.463281 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 23:12:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:12:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0319 23:12:14.455158 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:12:14.456988 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:12:14.456997 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:12:14.457002 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:12:14.457023 543705 disk_worker.go:494] system disk:vda1
I0319 23:12:14.457061 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:12:15.456497 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:12:15.456505 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:12:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:12:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:12:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:12:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:12:16.472345 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:12:21.460987 543705 disk_info.go:125] begin check local disk info of client
I0319 23:12:21.463454 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:12:21.463462 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aac40 0xc0001aac80]
E0319 23:12:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:23.409758 543705 memory.go:184] no items to output this cycle
I0319 23:12:23.409788 543705 cpu.go:275] no items to output this cycle
E0319 23:12:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:33.409763 543705 memory.go:184] no items to output this cycle
I0319 23:12:33.409786 543705 cpu.go:275] no items to output this cycle
I0319 23:12:37.945736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:12:37.945743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:12:43.409846 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:43.410734 543705 memory.go:191] Add success.
I0319 23:12:43.409899 543705 cpu.go:282] Add success.
I0319 23:12:43.420437 543705 net.go:648] Add success.
I0319 23:12:43.423147 543705 net.go:770] primary dev: ETH0
I0319 23:12:43.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:12:43.423174 543705 net.go:698] Add success.
I0319 23:12:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:12:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:12:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:12:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:12:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:12:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 23:13:03.409850 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:03.409870 543705 memory.go:184] no items to output this cycle
I0319 23:13:03.409940 543705 cpu.go:275] no items to output this cycle
E0319 23:13:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:13.409813 543705 memory.go:191] Add success.
I0319 23:13:13.409822 543705 cpu.go:282] Add success.
W0319 23:13:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:13:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:13:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:13:13.420165 543705 net.go:648] Add success.
I0319 23:13:13.423121 543705 net.go:770] primary dev: ETH0
I0319 23:13:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:13:13.423146 543705 net.go:698] Add success.
I0319 23:13:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:13:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:13:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0319 23:13:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:13:14.456973 543705 disk_worker.go:494] system disk:vda1
I0319 23:13:14.457016 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:13:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:13:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:13:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:13:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:13:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:13:21.464096 543705 disk_info.go:125] begin check local disk info of client
I0319 23:13:21.466514 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:13:21.466522 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328400 0xc000328440]
E0319 23:13:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:23.409792 543705 memory.go:184] no items to output this cycle
I0319 23:13:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 23:13:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:33.409766 543705 memory.go:184] no items to output this cycle
I0319 23:13:33.409795 543705 cpu.go:275] no items to output this cycle
E0319 23:13:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:43.409817 543705 memory.go:191] Add success.
I0319 23:13:43.409826 543705 cpu.go:282] Add success.
I0319 23:13:43.419985 543705 net.go:648] Add success.
I0319 23:13:43.422524 543705 net.go:770] primary dev: ETH0
I0319 23:13:43.422538 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:13:43.422549 543705 net.go:698] Add success.
I0319 23:13:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:13:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:13:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:13:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:13:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:13:53.409777 543705 cpu.go:275] no items to output this cycle
E0319 23:14:03.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:03.409866 543705 memory.go:184] no items to output this cycle
I0319 23:14:03.410018 543705 cpu.go:275] no items to output this cycle
E0319 23:14:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:13.409793 543705 memory.go:191] Add success.
I0319 23:14:13.409798 543705 cpu.go:282] Add success.
W0319 23:14:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:14:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:14:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:14:13.420272 543705 net.go:648] Add success.
I0319 23:14:13.422734 543705 net.go:770] primary dev: ETH0
I0319 23:14:13.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:14:13.422759 543705 net.go:698] Add success.
I0319 23:14:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:14:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:14:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0319 23:14:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:14:14.456483 543705 disk_worker.go:494] system disk:vda1
I0319 23:14:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:14:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:14:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:14:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:14:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:14:21.467087 543705 disk_info.go:125] begin check local disk info of client
I0319 23:14:21.469471 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:14:21.469477 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328340 0xc000328380]
E0319 23:14:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:23.409788 543705 memory.go:184] no items to output this cycle
I0319 23:14:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 23:14:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:33.409779 543705 memory.go:184] no items to output this cycle
I0319 23:14:33.409779 543705 cpu.go:275] no items to output this cycle
E0319 23:14:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:43.409804 543705 cpu.go:282] Add success.
I0319 23:14:43.409807 543705 memory.go:191] Add success.
I0319 23:14:43.419874 543705 net.go:648] Add success.
I0319 23:14:43.422623 543705 net.go:770] primary dev: ETH0
I0319 23:14:43.422637 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:14:43.422649 543705 net.go:698] Add success.
I0319 23:14:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:14:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:14:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:14:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:14:53.409790 543705 memory.go:184] no items to output this cycle
I0319 23:14:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:15:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:03.409804 543705 memory.go:184] no items to output this cycle
I0319 23:15:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 23:15:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:13.409814 543705 memory.go:191] Add success.
I0319 23:15:13.409822 543705 cpu.go:282] Add success.
W0319 23:15:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:15:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:15:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:15:13.420120 543705 net.go:648] Add success.
I0319 23:15:13.422778 543705 net.go:770] primary dev: ETH0
I0319 23:15:13.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:15:13.422803 543705 net.go:698] Add success.
I0319 23:15:13.468959 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"030feaef-767f-4dd0-85a5-2e3b6f6e72a4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:15:13.468993 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:15:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:15:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:15:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0319 23:15:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:15:14.456720 543705 disk_worker.go:494] system disk:vda1
I0319 23:15:14.456755 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:15:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:15:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:15:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:15:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:15:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:15:21.470030 543705 disk_info.go:125] begin check local disk info of client
I0319 23:15:21.472457 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:15:21.472462 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004755c0 0xc000475600]
E0319 23:15:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:23.409761 543705 memory.go:184] no items to output this cycle
I0319 23:15:23.409784 543705 cpu.go:275] no items to output this cycle
E0319 23:15:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:33.409774 543705 memory.go:184] no items to output this cycle
I0319 23:15:33.409793 543705 cpu.go:275] no items to output this cycle
I0319 23:15:37.948442 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:15:37.948449 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:15:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:43.410672 543705 memory.go:191] Add success.
I0319 23:15:43.409832 543705 cpu.go:282] Add success.
I0319 23:15:43.420461 543705 net.go:648] Add success.
I0319 23:15:43.423117 543705 net.go:770] primary dev: ETH0
I0319 23:15:43.423132 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:15:43.423146 543705 net.go:698] Add success.
I0319 23:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:15:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:15:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:15:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:15:53.409780 543705 cpu.go:275] no items to output this cycle
I0319 23:15:53.409782 543705 memory.go:184] no items to output this cycle
E0319 23:16:03.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:03.409903 543705 cpu.go:275] no items to output this cycle
I0319 23:16:03.409910 543705 memory.go:184] no items to output this cycle
E0319 23:16:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:13.409793 543705 memory.go:191] Add success.
I0319 23:16:13.409796 543705 cpu.go:282] Add success.
W0319 23:16:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:16:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:16:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:16:13.420092 543705 net.go:648] Add success.
I0319 23:16:13.423195 543705 net.go:770] primary dev: ETH0
I0319 23:16:13.423207 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:16:13.423219 543705 net.go:698] Add success.
I0319 23:16:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:16:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:16:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0319 23:16:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:16:14.456500 543705 disk_worker.go:494] system disk:vda1
I0319 23:16:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:16:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:16:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:16:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:16:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:16:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:16:21.473053 543705 disk_info.go:125] begin check local disk info of client
I0319 23:16:21.475506 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:16:21.475512 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329780 0xc0003297c0]
E0319 23:16:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:23.409806 543705 memory.go:184] no items to output this cycle
I0319 23:16:23.409821 543705 cpu.go:275] no items to output this cycle
E0319 23:16:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:33.409780 543705 memory.go:184] no items to output this cycle
I0319 23:16:33.409780 543705 cpu.go:275] no items to output this cycle
E0319 23:16:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:43.409813 543705 memory.go:191] Add success.
I0319 23:16:43.409819 543705 cpu.go:282] Add success.
I0319 23:16:43.419999 543705 net.go:648] Add success.
I0319 23:16:43.423786 543705 net.go:770] primary dev: ETH0
I0319 23:16:43.423798 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:16:43.423811 543705 net.go:698] Add success.
I0319 23:16:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:16:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:16:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:16:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:16:53.409782 543705 memory.go:184] no items to output this cycle
I0319 23:16:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:17:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:03.409788 543705 memory.go:184] no items to output this cycle
I0319 23:17:03.409799 543705 cpu.go:275] no items to output this cycle
I0319 23:17:13.409916 543705 cpu.go:282] Add success.
E0319 23:17:13.410011 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:13.410034 543705 memory.go:191] Add success.
W0319 23:17:13.410065 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:17:13.410079 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:17:13.410082 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:17:13.419716 543705 net.go:648] Add success.
I0319 23:17:13.422324 543705 net.go:770] primary dev: ETH0
I0319 23:17:13.422337 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:17:13.422348 543705 net.go:698] Add success.
I0319 23:17:13.452859 543705 event_worker.go:152] Polling the log file for events...
W0319 23:17:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:17:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 23:17:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:17:14.455905 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:17:14.455913 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:17:14.455919 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:17:14.456553 543705 disk_worker.go:494] system disk:vda1
I0319 23:17:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:17:15.456835 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:17:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:17:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:17:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:17:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:17:16.458036 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:17:16.472450 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:17:21.476120 543705 disk_info.go:125] begin check local disk info of client
I0319 23:17:21.478603 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:17:21.478609 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329a80 0xc000329ac0]
E0319 23:17:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:23.409770 543705 memory.go:184] no items to output this cycle
I0319 23:17:23.409775 543705 cpu.go:275] no items to output this cycle
E0319 23:17:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:33.409791 543705 memory.go:184] no items to output this cycle
I0319 23:17:33.409803 543705 cpu.go:275] no items to output this cycle
E0319 23:17:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:43.409781 543705 memory.go:191] Add success.
I0319 23:17:43.409818 543705 cpu.go:282] Add success.
I0319 23:17:43.419951 543705 net.go:648] Add success.
I0319 23:17:43.422712 543705 net.go:770] primary dev: ETH0
I0319 23:17:43.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:17:43.422739 543705 net.go:698] Add success.
I0319 23:17:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:17:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:17:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:17:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:17:53.409795 543705 memory.go:184] no items to output this cycle
I0319 23:17:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:18:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:03.409799 543705 memory.go:184] no items to output this cycle
I0319 23:18:03.409812 543705 cpu.go:275] no items to output this cycle
E0319 23:18:13.409893 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:13.409927 543705 memory.go:191] Add success.
I0319 23:18:13.409935 543705 cpu.go:282] Add success.
W0319 23:18:13.409963 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:18:13.409978 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:18:13.409981 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:18:13.419730 543705 net.go:648] Add success.
I0319 23:18:13.422723 543705 net.go:770] primary dev: ETH0
I0319 23:18:13.422742 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:18:13.422761 543705 net.go:698] Add success.
I0319 23:18:13.468715 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2e91423c-e587-48d1-9938-5d6082c0be70","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:18:13.468748 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:18:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:18:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:18:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0319 23:18:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:18:14.456609 543705 disk_worker.go:494] system disk:vda1
I0319 23:18:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:18:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:18:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:18:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:18:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:18:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:18:21.479098 543705 disk_info.go:125] begin check local disk info of client
I0319 23:18:21.481520 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:18:21.481527 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e2ac0 0xc0003e2b00]
E0319 23:18:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:23.409790 543705 memory.go:184] no items to output this cycle
I0319 23:18:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:18:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:33.409767 543705 memory.go:184] no items to output this cycle
I0319 23:18:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 23:18:37.949730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:18:37.949738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:18:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:43.410786 543705 memory.go:191] Add success.
I0319 23:18:43.409814 543705 cpu.go:282] Add success.
I0319 23:18:43.420524 543705 net.go:648] Add success.
I0319 23:18:43.423156 543705 net.go:770] primary dev: ETH0
I0319 23:18:43.423171 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:18:43.423186 543705 net.go:698] Add success.
I0319 23:18:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:18:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:18:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:18:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:18:53.409767 543705 memory.go:184] no items to output this cycle
I0319 23:18:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:19:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:03.409918 543705 memory.go:184] no items to output this cycle
I0319 23:19:03.409934 543705 cpu.go:275] no items to output this cycle
E0319 23:19:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:13.409795 543705 memory.go:191] Add success.
I0319 23:19:13.409798 543705 cpu.go:282] Add success.
W0319 23:19:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:19:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:19:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:19:13.420347 543705 net.go:648] Add success.
I0319 23:19:13.423145 543705 net.go:770] primary dev: ETH0
I0319 23:19:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:19:13.423170 543705 net.go:698] Add success.
I0319 23:19:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:19:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:19:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0319 23:19:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:19:14.456597 543705 disk_worker.go:494] system disk:vda1
I0319 23:19:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:19:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:19:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:19:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:19:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:19:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:19:21.482106 543705 disk_info.go:125] begin check local disk info of client
I0319 23:19:21.484537 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:19:21.484543 543705 disk_info.go:196] parse disk info done, disk is : [0xc000515840 0xc000515880]
E0319 23:19:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:23.409796 543705 memory.go:184] no items to output this cycle
I0319 23:19:23.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:19:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:33.409788 543705 memory.go:184] no items to output this cycle
I0319 23:19:33.409794 543705 cpu.go:275] no items to output this cycle
E0319 23:19:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:43.409802 543705 memory.go:191] Add success.
I0319 23:19:43.409803 543705 cpu.go:282] Add success.
I0319 23:19:43.419904 543705 net.go:648] Add success.
I0319 23:19:43.422896 543705 net.go:770] primary dev: ETH0
I0319 23:19:43.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:19:43.422924 543705 net.go:698] Add success.
I0319 23:19:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:19:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:19:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:19:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:19:53.409783 543705 memory.go:184] no items to output this cycle
I0319 23:19:53.409784 543705 cpu.go:275] no items to output this cycle
E0319 23:20:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:03.409903 543705 memory.go:184] no items to output this cycle
I0319 23:20:03.409902 543705 cpu.go:275] no items to output this cycle
E0319 23:20:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:13.409825 543705 memory.go:191] Add success.
I0319 23:20:13.409839 543705 cpu.go:282] Add success.
W0319 23:20:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:20:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:20:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:20:13.420133 543705 net.go:648] Add success.
I0319 23:20:13.422503 543705 net.go:770] primary dev: ETH0
I0319 23:20:13.422517 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:20:13.422530 543705 net.go:698] Add success.
I0319 23:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:20:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:20:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0319 23:20:14.455147 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:20:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 23:20:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:20:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:20:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:20:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:20:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:20:21.485116 543705 disk_info.go:125] begin check local disk info of client
I0319 23:20:21.487549 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:20:21.487554 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab7c0 0xc0001ab800]
E0319 23:20:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:23.409790 543705 memory.go:184] no items to output this cycle
I0319 23:20:23.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:20:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:33.409770 543705 memory.go:184] no items to output this cycle
I0319 23:20:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:20:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:43.409823 543705 memory.go:191] Add success.
I0319 23:20:43.409830 543705 cpu.go:282] Add success.
I0319 23:20:43.420048 543705 net.go:648] Add success.
I0319 23:20:43.422705 543705 net.go:770] primary dev: ETH0
I0319 23:20:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:20:43.422733 543705 net.go:698] Add success.
I0319 23:20:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:20:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:20:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:20:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:20:53.409783 543705 memory.go:184] no items to output this cycle
I0319 23:20:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 23:21:03.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:03.409879 543705 memory.go:184] no items to output this cycle
I0319 23:21:03.409892 543705 cpu.go:275] no items to output this cycle
E0319 23:21:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:13.409775 543705 memory.go:191] Add success.
W0319 23:21:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:21:13.409807 543705 cpu.go:282] Add success.
W0319 23:21:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:21:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:21:13.420355 543705 net.go:648] Add success.
I0319 23:21:13.423204 543705 net.go:770] primary dev: ETH0
I0319 23:21:13.423224 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:21:13.423239 543705 net.go:698] Add success.
I0319 23:21:13.467604 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ae02aa2c-897c-4881-9ae0-e84f88b7260a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:21:13.467638 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:21:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:21:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:21:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0319 23:21:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:21:14.456550 543705 disk_worker.go:494] system disk:vda1
I0319 23:21:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:21:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:21:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:21:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:21:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:21:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:21:21.488123 543705 disk_info.go:125] begin check local disk info of client
I0319 23:21:21.490661 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:21:21.490668 543705 disk_info.go:196] parse disk info done, disk is : [0xc000346280 0xc0003462c0]
E0319 23:21:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:23.409757 543705 memory.go:184] no items to output this cycle
I0319 23:21:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 23:21:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:33.409784 543705 memory.go:184] no items to output this cycle
I0319 23:21:33.409787 543705 cpu.go:275] no items to output this cycle
I0319 23:21:37.952451 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:21:37.952458 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:21:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:43.410657 543705 memory.go:191] Add success.
I0319 23:21:43.409834 543705 cpu.go:282] Add success.
I0319 23:21:43.420333 543705 net.go:648] Add success.
I0319 23:21:43.423421 543705 net.go:770] primary dev: ETH0
I0319 23:21:43.423434 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:21:43.423452 543705 net.go:698] Add success.
I0319 23:21:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:21:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:21:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:21:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:21:53.409784 543705 memory.go:184] no items to output this cycle
I0319 23:21:53.409788 543705 cpu.go:275] no items to output this cycle
E0319 23:22:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:03.409798 543705 memory.go:184] no items to output this cycle
I0319 23:22:03.409811 543705 cpu.go:275] no items to output this cycle
E0319 23:22:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:13.409779 543705 memory.go:191] Add success.
W0319 23:22:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:22:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:22:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:22:13.409821 543705 cpu.go:282] Add success.
I0319 23:22:13.420118 543705 net.go:648] Add success.
I0319 23:22:13.423044 543705 net.go:770] primary dev: ETH0
I0319 23:22:13.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:22:13.423068 543705 net.go:698] Add success.
W0319 23:22:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:22:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0319 23:22:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:22:14.455858 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:22:14.455867 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:22:14.455873 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:22:14.456557 543705 disk_worker.go:494] system disk:vda1
I0319 23:22:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:22:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:22:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:22:16.457893 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:22:16.457893 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:22:16.457948 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:22:16.457967 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:22:16.472278 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:22:21.491095 543705 disk_info.go:125] begin check local disk info of client
I0319 23:22:21.493455 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:22:21.493461 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b980 0xc00007b9c0]
E0319 23:22:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:23.409787 543705 memory.go:184] no items to output this cycle
I0319 23:22:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:22:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:33.409786 543705 memory.go:184] no items to output this cycle
I0319 23:22:33.409792 543705 cpu.go:275] no items to output this cycle
E0319 23:22:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:43.409820 543705 memory.go:191] Add success.
I0319 23:22:43.409825 543705 cpu.go:282] Add success.
I0319 23:22:43.420139 543705 net.go:648] Add success.
I0319 23:22:43.422926 543705 net.go:770] primary dev: ETH0
I0319 23:22:43.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:22:43.422950 543705 net.go:698] Add success.
I0319 23:22:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:22:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:22:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:22:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:22:53.409772 543705 memory.go:184] no items to output this cycle
I0319 23:22:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:23:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:03.409782 543705 memory.go:184] no items to output this cycle
I0319 23:23:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:23:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:13.409815 543705 memory.go:191] Add success.
I0319 23:23:13.409820 543705 cpu.go:282] Add success.
W0319 23:23:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:23:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:23:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:23:13.420120 543705 net.go:648] Add success.
I0319 23:23:13.422907 543705 net.go:770] primary dev: ETH0
I0319 23:23:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:23:13.422946 543705 net.go:698] Add success.
I0319 23:23:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:23:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:23:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0319 23:23:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:23:14.456633 543705 disk_worker.go:494] system disk:vda1
I0319 23:23:14.456681 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:23:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:23:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:23:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:23:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:23:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:23:21.494215 543705 disk_info.go:125] begin check local disk info of client
I0319 23:23:21.496622 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:23:21.496629 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abd00 0xc0001abd40]
E0319 23:23:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:23.409803 543705 memory.go:184] no items to output this cycle
I0319 23:23:23.409817 543705 cpu.go:275] no items to output this cycle
E0319 23:23:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:33.409799 543705 cpu.go:275] no items to output this cycle
I0319 23:23:33.409809 543705 memory.go:184] no items to output this cycle
E0319 23:23:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:43.409832 543705 memory.go:191] Add success.
I0319 23:23:43.409837 543705 cpu.go:282] Add success.
I0319 23:23:43.420090 543705 net.go:648] Add success.
I0319 23:23:43.422903 543705 net.go:770] primary dev: ETH0
I0319 23:23:43.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:23:43.422928 543705 net.go:698] Add success.
I0319 23:23:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:23:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:23:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:23:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:23:53.409804 543705 memory.go:184] no items to output this cycle
I0319 23:23:53.409815 543705 cpu.go:275] no items to output this cycle
E0319 23:24:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:03.409787 543705 memory.go:184] no items to output this cycle
I0319 23:24:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:24:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:13.409823 543705 memory.go:191] Add success.
I0319 23:24:13.409834 543705 cpu.go:282] Add success.
W0319 23:24:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:24:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:24:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:24:13.420206 543705 net.go:648] Add success.
I0319 23:24:13.422699 543705 net.go:770] primary dev: ETH0
I0319 23:24:13.422714 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:24:13.422729 543705 net.go:698] Add success.
I0319 23:24:13.463161 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ea7c5371-b4a3-4a57-9053-b9023e4f39d5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:24:13.463194 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:24:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:24:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:24:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 23:24:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:24:14.456603 543705 disk_worker.go:494] system disk:vda1
I0319 23:24:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:24:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:24:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:24:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:24:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:24:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:24:21.497230 543705 disk_info.go:125] begin check local disk info of client
I0319 23:24:21.499637 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:24:21.499643 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314f00 0xc000314f40]
E0319 23:24:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:23.409785 543705 memory.go:184] no items to output this cycle
I0319 23:24:23.409805 543705 cpu.go:275] no items to output this cycle
E0319 23:24:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:33.409776 543705 memory.go:184] no items to output this cycle
I0319 23:24:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 23:24:37.953730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:24:37.953737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:24:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:43.410650 543705 memory.go:191] Add success.
I0319 23:24:43.409810 543705 cpu.go:282] Add success.
I0319 23:24:43.419685 543705 net.go:648] Add success.
I0319 23:24:43.422419 543705 net.go:770] primary dev: ETH0
I0319 23:24:43.422432 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:24:43.422444 543705 net.go:698] Add success.
I0319 23:24:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:24:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:24:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:24:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:24:53.409770 543705 memory.go:184] no items to output this cycle
I0319 23:24:53.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:25:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:03.409787 543705 memory.go:184] no items to output this cycle
I0319 23:25:03.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:25:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:13.409781 543705 memory.go:191] Add success.
W0319 23:25:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:25:13.409809 543705 cpu.go:282] Add success.
W0319 23:25:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:25:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:25:13.420126 543705 net.go:648] Add success.
I0319 23:25:13.422769 543705 net.go:770] primary dev: ETH0
I0319 23:25:13.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:25:13.422795 543705 net.go:698] Add success.
I0319 23:25:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:25:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:25:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0319 23:25:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:25:14.456594 543705 disk_worker.go:494] system disk:vda1
I0319 23:25:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:25:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:25:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:25:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:25:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:25:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:25:21.500247 543705 disk_info.go:125] begin check local disk info of client
I0319 23:25:21.502683 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:25:21.502688 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384300 0xc000384340]
E0319 23:25:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:23.409789 543705 memory.go:184] no items to output this cycle
I0319 23:25:23.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:25:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:33.409806 543705 memory.go:184] no items to output this cycle
I0319 23:25:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 23:25:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:43.409921 543705 memory.go:191] Add success.
I0319 23:25:43.409960 543705 cpu.go:282] Add success.
I0319 23:25:43.419740 543705 net.go:648] Add success.
I0319 23:25:43.422462 543705 net.go:770] primary dev: ETH0
I0319 23:25:43.422475 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:25:43.422486 543705 net.go:698] Add success.
I0319 23:25:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:25:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:25:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:25:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:25:53.409763 543705 memory.go:184] no items to output this cycle
I0319 23:25:53.409797 543705 cpu.go:275] no items to output this cycle
E0319 23:26:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:03.409774 543705 memory.go:184] no items to output this cycle
I0319 23:26:03.409785 543705 cpu.go:275] no items to output this cycle
E0319 23:26:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:13.409793 543705 memory.go:191] Add success.
I0319 23:26:13.409794 543705 cpu.go:282] Add success.
W0319 23:26:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:26:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:26:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:26:13.420501 543705 net.go:648] Add success.
I0319 23:26:13.423437 543705 net.go:770] primary dev: ETH0
I0319 23:26:13.423451 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:26:13.423463 543705 net.go:698] Add success.
I0319 23:26:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:26:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:26:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0319 23:26:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:26:14.456511 543705 disk_worker.go:494] system disk:vda1
I0319 23:26:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:26:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:26:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:26:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:26:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:26:21.503266 543705 disk_info.go:125] begin check local disk info of client
I0319 23:26:21.505682 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:26:21.505687 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315740 0xc000315780]
E0319 23:26:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:23.409780 543705 memory.go:184] no items to output this cycle
I0319 23:26:23.409794 543705 cpu.go:275] no items to output this cycle
E0319 23:26:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:33.409783 543705 memory.go:184] no items to output this cycle
I0319 23:26:33.409789 543705 cpu.go:275] no items to output this cycle
E0319 23:26:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:43.409795 543705 memory.go:191] Add success.
I0319 23:26:43.409796 543705 cpu.go:282] Add success.
I0319 23:26:43.420069 543705 net.go:648] Add success.
I0319 23:26:43.422660 543705 net.go:770] primary dev: ETH0
I0319 23:26:43.422673 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:26:43.422684 543705 net.go:698] Add success.
I0319 23:26:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:26:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:26:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:26:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:26:53.409796 543705 memory.go:184] no items to output this cycle
I0319 23:26:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:27:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:03.409786 543705 cpu.go:275] no items to output this cycle
I0319 23:27:03.409799 543705 memory.go:184] no items to output this cycle
E0319 23:27:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:13.409796 543705 memory.go:191] Add success.
I0319 23:27:13.409801 543705 cpu.go:282] Add success.
W0319 23:27:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:27:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:27:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:27:13.420120 543705 net.go:648] Add success.
I0319 23:27:13.423150 543705 net.go:770] primary dev: ETH0
I0319 23:27:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:27:13.423173 543705 net.go:698] Add success.
I0319 23:27:13.429508 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 23:27:13.453662 543705 event_worker.go:152] Polling the log file for events...
I0319 23:27:13.469158 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a8f304f4-594a-4131-b1bb-8fc46fe42e16","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:27:13.469191 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 23:27:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:27:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0319 23:27:14.455158 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:27:14.456135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:27:14.456143 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:27:14.456149 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:27:14.456411 543705 disk_worker.go:494] system disk:vda1
I0319 23:27:14.456441 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:27:15.456789 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:27:15.456797 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:27:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:27:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:27:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:27:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:27:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:27:21.506171 543705 disk_info.go:125] begin check local disk info of client
I0319 23:27:21.508563 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:27:21.508570 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329bc0 0xc000329c00]
E0319 23:27:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:23.409789 543705 memory.go:184] no items to output this cycle
I0319 23:27:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:27:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:33.409802 543705 memory.go:184] no items to output this cycle
I0319 23:27:33.409817 543705 cpu.go:275] no items to output this cycle
I0319 23:27:37.956470 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:27:37.956477 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:27:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:43.410651 543705 memory.go:191] Add success.
I0319 23:27:43.409810 543705 cpu.go:282] Add success.
I0319 23:27:43.420430 543705 net.go:648] Add success.
I0319 23:27:43.423351 543705 net.go:770] primary dev: ETH0
I0319 23:27:43.423364 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:27:43.423376 543705 net.go:698] Add success.
I0319 23:27:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:27:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:27:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:27:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:27:53.409801 543705 memory.go:184] no items to output this cycle
I0319 23:27:53.409812 543705 cpu.go:275] no items to output this cycle
E0319 23:28:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:03.409797 543705 memory.go:184] no items to output this cycle
I0319 23:28:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:28:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:13.409822 543705 memory.go:191] Add success.
I0319 23:28:13.409825 543705 cpu.go:282] Add success.
W0319 23:28:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:28:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:28:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:28:13.420173 543705 net.go:648] Add success.
I0319 23:28:13.422969 543705 net.go:770] primary dev: ETH0
I0319 23:28:13.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:28:13.422993 543705 net.go:698] Add success.
I0319 23:28:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:28:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:28:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0319 23:28:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:28:14.456558 543705 disk_worker.go:494] system disk:vda1
I0319 23:28:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:28:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:28:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:28:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:28:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:28:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:28:21.509293 543705 disk_info.go:125] begin check local disk info of client
I0319 23:28:21.511676 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:28:21.511683 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c46c0 0xc0000c4700]
E0319 23:28:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:23.409800 543705 memory.go:184] no items to output this cycle
I0319 23:28:23.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:28:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:33.409768 543705 memory.go:184] no items to output this cycle
I0319 23:28:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:28:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:43.409803 543705 memory.go:191] Add success.
I0319 23:28:43.409807 543705 cpu.go:282] Add success.
I0319 23:28:43.419950 543705 net.go:648] Add success.
I0319 23:28:43.422941 543705 net.go:770] primary dev: ETH0
I0319 23:28:43.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:28:43.422972 543705 net.go:698] Add success.
I0319 23:28:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:28:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:28:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:28:53.409922 543705 cpu.go:275] no items to output this cycle
E0319 23:28:53.409931 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:28:53.410059 543705 memory.go:184] no items to output this cycle
E0319 23:29:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:03.409788 543705 memory.go:184] no items to output this cycle
I0319 23:29:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:29:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:13.409780 543705 memory.go:191] Add success.
W0319 23:29:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:29:13.409808 543705 cpu.go:282] Add success.
W0319 23:29:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:29:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:29:13.420148 543705 net.go:648] Add success.
I0319 23:29:13.423817 543705 net.go:770] primary dev: ETH0
I0319 23:29:13.423830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:29:13.423842 543705 net.go:698] Add success.
I0319 23:29:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:29:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:29:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0319 23:29:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:29:14.456495 543705 disk_worker.go:494] system disk:vda1
I0319 23:29:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:29:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:29:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:29:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:29:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:29:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:29:21.511708 543705 disk_info.go:125] begin check local disk info of client
I0319 23:29:21.514302 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:29:21.514316 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ff580 0xc0003ff5c0]
E0319 23:29:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:23.409786 543705 memory.go:184] no items to output this cycle
I0319 23:29:23.409799 543705 cpu.go:275] no items to output this cycle
E0319 23:29:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:33.409778 543705 memory.go:184] no items to output this cycle
I0319 23:29:33.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:43.409802 543705 memory.go:191] Add success.
I0319 23:29:43.409805 543705 cpu.go:282] Add success.
I0319 23:29:43.419970 543705 net.go:648] Add success.
I0319 23:29:43.422941 543705 net.go:770] primary dev: ETH0
I0319 23:29:43.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:29:43.422967 543705 net.go:698] Add success.
I0319 23:29:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:29:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:29:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:29:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:29:53.409804 543705 memory.go:184] no items to output this cycle
I0319 23:29:53.409811 543705 cpu.go:275] no items to output this cycle
E0319 23:30:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:03.409766 543705 memory.go:184] no items to output this cycle
I0319 23:30:03.409796 543705 cpu.go:275] no items to output this cycle
E0319 23:30:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:13.409805 543705 memory.go:191] Add success.
I0319 23:30:13.409806 543705 cpu.go:282] Add success.
W0319 23:30:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:30:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:30:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:30:13.420368 543705 net.go:648] Add success.
I0319 23:30:13.422819 543705 net.go:770] primary dev: ETH0
I0319 23:30:13.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:30:13.422849 543705 net.go:698] Add success.
I0319 23:30:13.468944 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a612783a-87e5-497b-93af-16643dcfaa39","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:30:13.468977 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:30:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:30:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:30:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0319 23:30:14.455154 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:30:14.456488 543705 disk_worker.go:494] system disk:vda1
I0319 23:30:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:30:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:30:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:30:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:30:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:30:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:30:21.515259 543705 disk_info.go:125] begin check local disk info of client
I0319 23:30:21.517693 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:30:21.517699 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329a40 0xc000329a80]
E0319 23:30:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:23.409805 543705 memory.go:184] no items to output this cycle
I0319 23:30:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:30:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:33.409781 543705 memory.go:184] no items to output this cycle
I0319 23:30:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 23:30:37.957732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:30:37.957738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:30:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:43.410700 543705 memory.go:191] Add success.
I0319 23:30:43.409797 543705 cpu.go:282] Add success.
I0319 23:30:43.420418 543705 net.go:648] Add success.
I0319 23:30:43.423003 543705 net.go:770] primary dev: ETH0
I0319 23:30:43.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:30:43.423031 543705 net.go:698] Add success.
I0319 23:30:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:30:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:30:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:30:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:30:53.409773 543705 memory.go:184] no items to output this cycle
I0319 23:30:53.409782 543705 cpu.go:275] no items to output this cycle
E0319 23:31:03.410286 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:03.410305 543705 memory.go:184] no items to output this cycle
I0319 23:31:03.410321 543705 cpu.go:275] no items to output this cycle
E0319 23:31:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:13.409795 543705 memory.go:191] Add success.
I0319 23:31:13.409800 543705 cpu.go:282] Add success.
W0319 23:31:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:31:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:31:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:31:13.420252 543705 net.go:648] Add success.
I0319 23:31:13.422970 543705 net.go:770] primary dev: ETH0
I0319 23:31:13.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:31:13.422995 543705 net.go:698] Add success.
I0319 23:31:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:31:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:31:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0319 23:31:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:31:14.456611 543705 disk_worker.go:494] system disk:vda1
I0319 23:31:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:31:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:31:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:31:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:31:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:31:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:31:21.518234 543705 disk_info.go:125] begin check local disk info of client
I0319 23:31:21.520673 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:31:21.520679 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004744c0 0xc000474500]
E0319 23:31:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:23.409804 543705 memory.go:184] no items to output this cycle
I0319 23:31:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:31:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:33.409803 543705 memory.go:184] no items to output this cycle
I0319 23:31:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:31:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:43.409791 543705 memory.go:191] Add success.
I0319 23:31:43.409817 543705 cpu.go:282] Add success.
I0319 23:31:43.419881 543705 net.go:648] Add success.
I0319 23:31:43.422742 543705 net.go:770] primary dev: ETH0
I0319 23:31:43.422755 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:31:43.422767 543705 net.go:698] Add success.
I0319 23:31:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:31:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:31:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:31:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:31:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:31:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:32:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:03.409798 543705 memory.go:184] no items to output this cycle
I0319 23:32:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:32:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:13.409794 543705 memory.go:191] Add success.
I0319 23:32:13.409812 543705 cpu.go:282] Add success.
W0319 23:32:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:32:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:32:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:32:13.420072 543705 net.go:648] Add success.
I0319 23:32:13.422902 543705 net.go:770] primary dev: ETH0
I0319 23:32:13.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:32:13.422929 543705 net.go:698] Add success.
W0319 23:32:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:32:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0319 23:32:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:32:14.455851 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:32:14.455860 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:32:14.455866 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:32:14.456556 543705 disk_worker.go:494] system disk:vda1
I0319 23:32:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:32:15.456777 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:32:15.456786 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:32:16.457897 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:32:16.457897 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:32:16.457949 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:32:16.457968 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:32:16.472275 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:32:21.521258 543705 disk_info.go:125] begin check local disk info of client
I0319 23:32:21.523680 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:32:21.523687 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329380 0xc0003293c0]
E0319 23:32:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:23.409793 543705 memory.go:184] no items to output this cycle
I0319 23:32:23.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:32:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:33.409779 543705 memory.go:184] no items to output this cycle
I0319 23:32:33.409782 543705 cpu.go:275] no items to output this cycle
E0319 23:32:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:43.409806 543705 memory.go:191] Add success.
I0319 23:32:43.409808 543705 cpu.go:282] Add success.
I0319 23:32:43.419936 543705 net.go:648] Add success.
I0319 23:32:43.422576 543705 net.go:770] primary dev: ETH0
I0319 23:32:43.422590 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:32:43.422606 543705 net.go:698] Add success.
I0319 23:32:46.458500 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:32:46.458575 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:32:46.458596 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:32:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:32:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:32:53.409791 543705 cpu.go:275] no items to output this cycle
E0319 23:33:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:03.409818 543705 memory.go:184] no items to output this cycle
I0319 23:33:03.409830 543705 cpu.go:275] no items to output this cycle
E0319 23:33:13.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:13.409885 543705 memory.go:191] Add success.
W0319 23:33:13.409913 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:33:13.409949 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:33:13.409956 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:33:13.410001 543705 cpu.go:282] Add success.
I0319 23:33:13.419726 543705 net.go:648] Add success.
I0319 23:33:13.422532 543705 net.go:770] primary dev: ETH0
I0319 23:33:13.422545 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:33:13.422556 543705 net.go:698] Add success.
I0319 23:33:13.469260 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d9c556af-2412-470d-844e-07ce1b5c4808","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:33:13.469291 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:33:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:33:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:33:14.455150 543705 disk_worker.go:708] disk space is not compliant
W0319 23:33:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:33:14.456499 543705 disk_worker.go:494] system disk:vda1
I0319 23:33:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:33:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:33:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:33:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:33:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:33:16.472432 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:33:21.524274 543705 disk_info.go:125] begin check local disk info of client
I0319 23:33:21.526746 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:33:21.526753 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f2740 0xc0003f2780]
E0319 23:33:23.409818 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:23.409839 543705 memory.go:184] no items to output this cycle
I0319 23:33:23.409848 543705 cpu.go:275] no items to output this cycle
E0319 23:33:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:33.409808 543705 memory.go:184] no items to output this cycle
I0319 23:33:33.409820 543705 cpu.go:275] no items to output this cycle
I0319 23:33:37.960493 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:33:37.960499 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:33:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:43.410653 543705 memory.go:191] Add success.
I0319 23:33:43.409827 543705 cpu.go:282] Add success.
I0319 23:33:43.420360 543705 net.go:648] Add success.
I0319 23:33:43.422960 543705 net.go:770] primary dev: ETH0
I0319 23:33:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:33:43.422989 543705 net.go:698] Add success.
I0319 23:33:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:33:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:33:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:33:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:33:53.409788 543705 cpu.go:275] no items to output this cycle
I0319 23:33:53.409790 543705 memory.go:184] no items to output this cycle
E0319 23:34:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:03.409820 543705 memory.go:184] no items to output this cycle
I0319 23:34:03.409836 543705 cpu.go:275] no items to output this cycle
W0319 23:34:13.409705 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:34:13.409721 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:34:13.409725 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 23:34:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:13.409947 543705 cpu.go:282] Add success.
I0319 23:34:13.410039 543705 memory.go:191] Add success.
I0319 23:34:13.419736 543705 net.go:648] Add success.
I0319 23:34:13.423225 543705 net.go:770] primary dev: ETH0
I0319 23:34:13.423239 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:34:13.423251 543705 net.go:698] Add success.
I0319 23:34:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:34:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:34:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 23:34:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:34:14.456525 543705 disk_worker.go:494] system disk:vda1
I0319 23:34:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:34:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:34:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:34:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:34:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:34:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:34:21.527283 543705 disk_info.go:125] begin check local disk info of client
I0319 23:34:21.529755 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:34:21.529761 543705 disk_info.go:196] parse disk info done, disk is : [0xc000498540 0xc000498580]
E0319 23:34:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:23.409793 543705 memory.go:184] no items to output this cycle
I0319 23:34:23.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:34:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:33.409770 543705 memory.go:184] no items to output this cycle
I0319 23:34:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:34:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:43.409789 543705 memory.go:191] Add success.
I0319 23:34:43.409809 543705 cpu.go:282] Add success.
I0319 23:34:43.419985 543705 net.go:648] Add success.
I0319 23:34:43.422569 543705 net.go:770] primary dev: ETH0
I0319 23:34:43.422582 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:34:43.422593 543705 net.go:698] Add success.
I0319 23:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:34:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:34:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:34:53.410359 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:34:53.410375 543705 memory.go:184] no items to output this cycle
I0319 23:34:53.410412 543705 cpu.go:275] no items to output this cycle
E0319 23:35:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:03.409813 543705 memory.go:184] no items to output this cycle
I0319 23:35:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 23:35:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:13.409812 543705 memory.go:191] Add success.
I0319 23:35:13.409821 543705 cpu.go:282] Add success.
W0319 23:35:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:35:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:35:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:35:13.420807 543705 net.go:648] Add success.
I0319 23:35:13.423751 543705 net.go:770] primary dev: ETH0
I0319 23:35:13.423766 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:35:13.423780 543705 net.go:698] Add success.
I0319 23:35:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:35:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:35:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0319 23:35:14.455148 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:35:14.456508 543705 disk_worker.go:494] system disk:vda1
I0319 23:35:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:35:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:35:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:35:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:35:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:35:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:35:21.530315 543705 disk_info.go:125] begin check local disk info of client
I0319 23:35:21.532786 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:35:21.532793 543705 disk_info.go:196] parse disk info done, disk is : [0xc000538b00 0xc000538b40]
E0319 23:35:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:23.409790 543705 memory.go:184] no items to output this cycle
I0319 23:35:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 23:35:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:33.409780 543705 memory.go:184] no items to output this cycle
I0319 23:35:33.409785 543705 cpu.go:275] no items to output this cycle
E0319 23:35:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:43.409803 543705 memory.go:191] Add success.
I0319 23:35:43.409804 543705 cpu.go:282] Add success.
I0319 23:35:43.419991 543705 net.go:648] Add success.
I0319 23:35:43.422743 543705 net.go:770] primary dev: ETH0
I0319 23:35:43.422756 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:35:43.422769 543705 net.go:698] Add success.
I0319 23:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:35:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:35:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:35:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:35:53.409765 543705 memory.go:184] no items to output this cycle
I0319 23:35:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:36:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:03.409769 543705 memory.go:184] no items to output this cycle
I0319 23:36:03.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:36:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:13.409796 543705 memory.go:191] Add success.
I0319 23:36:13.409796 543705 cpu.go:282] Add success.
W0319 23:36:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:36:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:36:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:36:13.420129 543705 net.go:648] Add success.
I0319 23:36:13.423052 543705 net.go:770] primary dev: ETH0
I0319 23:36:13.423067 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:36:13.423078 543705 net.go:698] Add success.
I0319 23:36:13.467563 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"672b49c2-42b1-47f5-8eb7-cd96e4d8afb9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:36:13.467595 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:36:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:36:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:36:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0319 23:36:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:36:14.456625 543705 disk_worker.go:494] system disk:vda1
I0319 23:36:14.456653 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:36:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:36:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:36:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:36:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:36:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:36:21.533309 543705 disk_info.go:125] begin check local disk info of client
I0319 23:36:21.535793 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:36:21.535800 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005823c0 0xc000582400]
E0319 23:36:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:23.409795 543705 memory.go:184] no items to output this cycle
I0319 23:36:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 23:36:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:33.409803 543705 memory.go:184] no items to output this cycle
I0319 23:36:33.409814 543705 cpu.go:275] no items to output this cycle
I0319 23:36:37.961734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:36:37.961741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:36:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:43.410682 543705 memory.go:191] Add success.
I0319 23:36:43.409830 543705 cpu.go:282] Add success.
I0319 23:36:43.420391 543705 net.go:648] Add success.
I0319 23:36:43.423001 543705 net.go:770] primary dev: ETH0
I0319 23:36:43.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:36:43.423042 543705 net.go:698] Add success.
I0319 23:36:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:36:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:36:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:36:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:36:53.409782 543705 memory.go:184] no items to output this cycle
I0319 23:36:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:37:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:03.409802 543705 memory.go:184] no items to output this cycle
I0319 23:37:03.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:37:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:13.409795 543705 memory.go:191] Add success.
I0319 23:37:13.409796 543705 cpu.go:282] Add success.
W0319 23:37:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:37:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:37:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:37:13.420053 543705 net.go:648] Add success.
I0319 23:37:13.422920 543705 net.go:770] primary dev: ETH0
I0319 23:37:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:37:13.422950 543705 net.go:698] Add success.
I0319 23:37:13.453494 543705 event_worker.go:152] Polling the log file for events...
W0319 23:37:14.455372 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:37:14.455483 543705 disk_worker.go:708] disk space is not compliant
W0319 23:37:14.455486 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:37:14.456514 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:37:14.456521 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:37:14.456525 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:37:14.457697 543705 disk_worker.go:494] system disk:vda1
I0319 23:37:14.457739 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:37:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:37:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:37:16.458062 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:37:16.458064 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:37:16.458133 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:37:16.458156 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:37:16.472527 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:37:21.536323 543705 disk_info.go:125] begin check local disk info of client
I0319 23:37:21.538766 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:37:21.538773 543705 disk_info.go:196] parse disk info done, disk is : [0xc000517340 0xc000517840]
E0319 23:37:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:23.409788 543705 memory.go:184] no items to output this cycle
I0319 23:37:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 23:37:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:33.409778 543705 memory.go:184] no items to output this cycle
I0319 23:37:33.409784 543705 cpu.go:275] no items to output this cycle
E0319 23:37:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:43.409796 543705 memory.go:191] Add success.
I0319 23:37:43.409807 543705 cpu.go:282] Add success.
I0319 23:37:43.419873 543705 net.go:648] Add success.
I0319 23:37:43.422332 543705 net.go:770] primary dev: ETH0
I0319 23:37:43.422345 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:37:43.422357 543705 net.go:698] Add success.
I0319 23:37:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:37:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:37:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:37:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:37:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:37:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:38:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:03.409782 543705 memory.go:184] no items to output this cycle
I0319 23:38:03.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:38:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:13.409803 543705 memory.go:191] Add success.
I0319 23:38:13.409808 543705 cpu.go:282] Add success.
W0319 23:38:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:38:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:38:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:38:13.420057 543705 net.go:648] Add success.
I0319 23:38:13.423020 543705 net.go:770] primary dev: ETH0
I0319 23:38:13.423034 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:38:13.423045 543705 net.go:698] Add success.
I0319 23:38:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:38:14.455367 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:38:14.455465 543705 disk_worker.go:708] disk space is not compliant
W0319 23:38:14.455518 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:38:14.457106 543705 disk_worker.go:494] system disk:vda1
I0319 23:38:14.457135 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:38:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:38:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:38:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:38:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:38:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:38:21.539449 543705 disk_info.go:125] begin check local disk info of client
I0319 23:38:21.541935 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:38:21.541943 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a4000 0xc0002a4080]
E0319 23:38:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:23.409761 543705 memory.go:184] no items to output this cycle
I0319 23:38:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:38:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:33.409765 543705 memory.go:184] no items to output this cycle
I0319 23:38:33.409797 543705 cpu.go:275] no items to output this cycle
E0319 23:38:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:43.409792 543705 memory.go:191] Add success.
I0319 23:38:43.409811 543705 cpu.go:282] Add success.
I0319 23:38:43.420138 543705 net.go:648] Add success.
I0319 23:38:43.423064 543705 net.go:770] primary dev: ETH0
I0319 23:38:43.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:38:43.423090 543705 net.go:698] Add success.
I0319 23:38:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:38:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:38:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:38:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:38:53.409767 543705 memory.go:184] no items to output this cycle
I0319 23:38:53.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:39:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:03.409817 543705 memory.go:184] no items to output this cycle
I0319 23:39:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 23:39:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:13.409790 543705 memory.go:191] Add success.
I0319 23:39:13.409791 543705 cpu.go:282] Add success.
W0319 23:39:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:39:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:39:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:39:13.420203 543705 net.go:648] Add success.
I0319 23:39:13.422963 543705 net.go:770] primary dev: ETH0
I0319 23:39:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:39:13.422988 543705 net.go:698] Add success.
I0319 23:39:13.471180 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ae511726-ce6f-4682-8be7-3894fdb00e48","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:39:13.471210 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:39:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:39:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:39:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0319 23:39:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:39:14.456629 543705 disk_worker.go:494] system disk:vda1
I0319 23:39:14.456662 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:39:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:39:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:39:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:39:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:39:21.542351 543705 disk_info.go:125] begin check local disk info of client
I0319 23:39:21.544844 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:39:21.544851 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba2c0 0xc0002ba300]
E0319 23:39:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:23.409761 543705 memory.go:184] no items to output this cycle
I0319 23:39:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:39:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:33.409800 543705 memory.go:184] no items to output this cycle
I0319 23:39:33.409810 543705 cpu.go:275] no items to output this cycle
I0319 23:39:37.964508 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:39:37.964515 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:39:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:43.410611 543705 memory.go:191] Add success.
I0319 23:39:43.409830 543705 cpu.go:282] Add success.
I0319 23:39:43.420310 543705 net.go:648] Add success.
I0319 23:39:43.422855 543705 net.go:770] primary dev: ETH0
I0319 23:39:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:39:43.422881 543705 net.go:698] Add success.
I0319 23:39:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:39:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:39:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:39:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:39:53.409789 543705 memory.go:184] no items to output this cycle
I0319 23:39:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:40:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:03.409771 543705 memory.go:184] no items to output this cycle
I0319 23:40:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:40:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:13.409793 543705 memory.go:191] Add success.
I0319 23:40:13.409811 543705 cpu.go:282] Add success.
W0319 23:40:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:40:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:40:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:40:13.420141 543705 net.go:648] Add success.
I0319 23:40:13.422596 543705 net.go:770] primary dev: ETH0
I0319 23:40:13.422611 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:40:13.422624 543705 net.go:698] Add success.
I0319 23:40:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:40:14.455327 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:40:14.455434 543705 disk_worker.go:708] disk space is not compliant
W0319 23:40:14.455443 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:40:14.457515 543705 disk_worker.go:494] system disk:vda1
I0319 23:40:14.457557 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:40:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:40:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:40:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:40:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:40:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:40:21.545473 543705 disk_info.go:125] begin check local disk info of client
I0319 23:40:21.547941 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:40:21.547948 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492280 0xc0004922c0]
E0319 23:40:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:23.409760 543705 memory.go:184] no items to output this cycle
I0319 23:40:23.409795 543705 cpu.go:275] no items to output this cycle
E0319 23:40:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:33.409801 543705 memory.go:184] no items to output this cycle
I0319 23:40:33.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:40:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:43.409824 543705 memory.go:191] Add success.
I0319 23:40:43.409829 543705 cpu.go:282] Add success.
I0319 23:40:43.419976 543705 net.go:648] Add success.
I0319 23:40:43.422743 543705 net.go:770] primary dev: ETH0
I0319 23:40:43.422756 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:40:43.422770 543705 net.go:698] Add success.
I0319 23:40:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:40:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:40:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:40:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:40:53.409777 543705 memory.go:184] no items to output this cycle
I0319 23:40:53.409779 543705 cpu.go:275] no items to output this cycle
E0319 23:41:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:03.409795 543705 memory.go:184] no items to output this cycle
I0319 23:41:03.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:41:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:13.409819 543705 memory.go:191] Add success.
I0319 23:41:13.409822 543705 cpu.go:282] Add success.
W0319 23:41:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:41:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:41:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:41:13.420282 543705 net.go:648] Add success.
I0319 23:41:13.423256 543705 net.go:770] primary dev: ETH0
I0319 23:41:13.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:41:13.423285 543705 net.go:698] Add success.
I0319 23:41:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:41:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:41:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0319 23:41:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:41:14.456518 543705 disk_worker.go:494] system disk:vda1
I0319 23:41:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:41:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:41:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:41:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:41:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:41:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:41:21.548380 543705 disk_info.go:125] begin check local disk info of client
I0319 23:41:21.550861 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:41:21.550868 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004963c0 0xc000496400]
E0319 23:41:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:23.409767 543705 memory.go:184] no items to output this cycle
I0319 23:41:23.409774 543705 cpu.go:275] no items to output this cycle
E0319 23:41:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:33.409800 543705 memory.go:184] no items to output this cycle
I0319 23:41:33.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:41:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:43.409791 543705 memory.go:191] Add success.
I0319 23:41:43.409811 543705 cpu.go:282] Add success.
I0319 23:41:43.419954 543705 net.go:648] Add success.
I0319 23:41:43.422556 543705 net.go:770] primary dev: ETH0
I0319 23:41:43.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:41:43.422586 543705 net.go:698] Add success.
I0319 23:41:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:41:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:41:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:41:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:41:53.409772 543705 memory.go:184] no items to output this cycle
I0319 23:41:53.409780 543705 cpu.go:275] no items to output this cycle
E0319 23:42:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:03.409776 543705 memory.go:184] no items to output this cycle
I0319 23:42:03.409780 543705 cpu.go:275] no items to output this cycle
E0319 23:42:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:13.409826 543705 memory.go:191] Add success.
I0319 23:42:13.409837 543705 cpu.go:282] Add success.
W0319 23:42:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:42:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:42:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:42:13.420155 543705 net.go:648] Add success.
I0319 23:42:13.422877 543705 net.go:770] primary dev: ETH0
I0319 23:42:13.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:42:13.422902 543705 net.go:698] Add success.
I0319 23:42:13.468619 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bbe68286-3cbe-484e-a696-cd65f799411c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:42:13.468654 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 23:42:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:42:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0319 23:42:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:42:14.456890 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:42:14.456899 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:42:14.456905 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:42:14.456908 543705 disk_worker.go:494] system disk:vda1
I0319 23:42:14.456943 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:42:15.456877 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:42:15.456887 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:42:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:42:16.457975 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:42:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:42:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:42:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:42:21.551501 543705 disk_info.go:125] begin check local disk info of client
I0319 23:42:21.554025 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:42:21.554031 543705 disk_info.go:196] parse disk info done, disk is : [0xc000496340 0xc000496380]
E0319 23:42:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:23.409774 543705 memory.go:184] no items to output this cycle
I0319 23:42:23.409782 543705 cpu.go:275] no items to output this cycle
E0319 23:42:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:33.409763 543705 memory.go:184] no items to output this cycle
I0319 23:42:33.409796 543705 cpu.go:275] no items to output this cycle
I0319 23:42:37.965737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:42:37.965743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:42:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:43.410700 543705 memory.go:191] Add success.
I0319 23:42:43.409808 543705 cpu.go:282] Add success.
I0319 23:42:43.420424 543705 net.go:648] Add success.
I0319 23:42:43.423194 543705 net.go:770] primary dev: ETH0
I0319 23:42:43.423208 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:42:43.423219 543705 net.go:698] Add success.
I0319 23:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:42:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:42:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:42:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:42:53.409768 543705 memory.go:184] no items to output this cycle
I0319 23:42:53.409801 543705 cpu.go:275] no items to output this cycle
E0319 23:43:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:03.409796 543705 memory.go:184] no items to output this cycle
I0319 23:43:03.409821 543705 cpu.go:275] no items to output this cycle
E0319 23:43:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:13.409799 543705 memory.go:191] Add success.
I0319 23:43:13.409816 543705 cpu.go:282] Add success.
W0319 23:43:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:43:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:43:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:43:13.420076 543705 net.go:648] Add success.
I0319 23:43:13.422841 543705 net.go:770] primary dev: ETH0
I0319 23:43:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:43:13.422870 543705 net.go:698] Add success.
I0319 23:43:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:43:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:43:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0319 23:43:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:43:14.456513 543705 disk_worker.go:494] system disk:vda1
I0319 23:43:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:43:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:43:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:43:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:43:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:43:21.554479 543705 disk_info.go:125] begin check local disk info of client
I0319 23:43:21.556946 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:43:21.556953 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2340 0xc0003b2380]
E0319 23:43:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:23.409796 543705 memory.go:184] no items to output this cycle
I0319 23:43:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:43:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:33.409775 543705 memory.go:184] no items to output this cycle
I0319 23:43:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:43:43.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:43.409837 543705 memory.go:191] Add success.
I0319 23:43:43.409847 543705 cpu.go:282] Add success.
I0319 23:43:43.419969 543705 net.go:648] Add success.
I0319 23:43:43.422970 543705 net.go:770] primary dev: ETH0
I0319 23:43:43.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:43:43.422997 543705 net.go:698] Add success.
I0319 23:43:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:43:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:43:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:43:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:43:53.409811 543705 memory.go:184] no items to output this cycle
I0319 23:43:53.409824 543705 cpu.go:275] no items to output this cycle
E0319 23:44:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:03.409789 543705 memory.go:184] no items to output this cycle
I0319 23:44:03.409791 543705 cpu.go:275] no items to output this cycle
E0319 23:44:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:13.409799 543705 memory.go:191] Add success.
I0319 23:44:13.409819 543705 cpu.go:282] Add success.
W0319 23:44:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:44:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:44:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:44:13.420165 543705 net.go:648] Add success.
I0319 23:44:13.422664 543705 net.go:770] primary dev: ETH0
I0319 23:44:13.422677 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:44:13.422689 543705 net.go:698] Add success.
I0319 23:44:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:44:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:44:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 23:44:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:44:14.456561 543705 disk_worker.go:494] system disk:vda1
I0319 23:44:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:44:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:44:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:44:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:44:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:44:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:44:21.557426 543705 disk_info.go:125] begin check local disk info of client
I0319 23:44:21.559898 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:44:21.559906 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492200 0xc000492240]
E0319 23:44:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:23.409774 543705 memory.go:184] no items to output this cycle
I0319 23:44:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 23:44:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:33.409815 543705 memory.go:184] no items to output this cycle
I0319 23:44:33.409833 543705 cpu.go:275] no items to output this cycle
E0319 23:44:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:43.409791 543705 memory.go:191] Add success.
I0319 23:44:43.409811 543705 cpu.go:282] Add success.
I0319 23:44:43.420006 543705 net.go:648] Add success.
I0319 23:44:43.422882 543705 net.go:770] primary dev: ETH0
I0319 23:44:43.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:44:43.422910 543705 net.go:698] Add success.
I0319 23:44:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:44:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:44:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:44:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:44:53.409795 543705 memory.go:184] no items to output this cycle
I0319 23:44:53.409807 543705 cpu.go:275] no items to output this cycle
E0319 23:45:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:03.409782 543705 memory.go:184] no items to output this cycle
I0319 23:45:03.409816 543705 cpu.go:275] no items to output this cycle
E0319 23:45:13.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:13.409843 543705 memory.go:191] Add success.
I0319 23:45:13.409855 543705 cpu.go:282] Add success.
W0319 23:45:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:45:13.409905 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:45:13.409909 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:45:13.420263 543705 net.go:648] Add success.
I0319 23:45:13.423048 543705 net.go:770] primary dev: ETH0
I0319 23:45:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:45:13.423077 543705 net.go:698] Add success.
I0319 23:45:13.476791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6800e00e-185b-4406-8b81-954b2ed2088e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:45:13.476822 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:45:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:45:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:45:14.455228 543705 disk_worker.go:708] disk space is not compliant
W0319 23:45:14.455231 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:45:14.456742 543705 disk_worker.go:494] system disk:vda1
I0319 23:45:14.456775 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:45:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:45:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:45:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:45:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:45:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:45:21.560543 543705 disk_info.go:125] begin check local disk info of client
I0319 23:45:21.563037 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:45:21.563043 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002707c0 0xc000270800]
E0319 23:45:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:23.409769 543705 memory.go:184] no items to output this cycle
I0319 23:45:23.409778 543705 cpu.go:275] no items to output this cycle
E0319 23:45:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:33.409777 543705 memory.go:184] no items to output this cycle
I0319 23:45:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 23:45:37.965882 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:45:37.965888 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:45:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:43.410655 543705 memory.go:191] Add success.
I0319 23:45:43.409836 543705 cpu.go:282] Add success.
I0319 23:45:43.420437 543705 net.go:648] Add success.
I0319 23:45:43.423402 543705 net.go:770] primary dev: ETH0
I0319 23:45:43.423416 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:45:43.423429 543705 net.go:698] Add success.
I0319 23:45:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:45:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:45:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:45:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:45:53.409796 543705 memory.go:184] no items to output this cycle
I0319 23:45:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:46:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:03.409794 543705 memory.go:184] no items to output this cycle
I0319 23:46:03.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:46:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:13.409800 543705 memory.go:191] Add success.
W0319 23:46:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:46:13.409831 543705 cpu.go:282] Add success.
W0319 23:46:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:46:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:46:13.420634 543705 net.go:648] Add success.
I0319 23:46:13.423230 543705 net.go:770] primary dev: ETH0
I0319 23:46:13.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:46:13.423255 543705 net.go:698] Add success.
I0319 23:46:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:46:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:46:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0319 23:46:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:46:14.456582 543705 disk_worker.go:494] system disk:vda1
I0319 23:46:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:46:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:46:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:46:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:46:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:46:16.472355 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:46:21.563458 543705 disk_info.go:125] begin check local disk info of client
I0319 23:46:21.565901 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:46:21.565908 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5440 0xc0000c5480]
E0319 23:46:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:23.409790 543705 memory.go:184] no items to output this cycle
I0319 23:46:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 23:46:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:33.409807 543705 memory.go:184] no items to output this cycle
I0319 23:46:33.409818 543705 cpu.go:275] no items to output this cycle
E0319 23:46:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:43.409824 543705 memory.go:191] Add success.
I0319 23:46:43.409829 543705 cpu.go:282] Add success.
I0319 23:46:43.419998 543705 net.go:648] Add success.
I0319 23:46:43.422673 543705 net.go:770] primary dev: ETH0
I0319 23:46:43.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:46:43.422703 543705 net.go:698] Add success.
I0319 23:46:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:46:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:46:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:46:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:46:53.409781 543705 memory.go:184] no items to output this cycle
I0319 23:46:53.409786 543705 cpu.go:275] no items to output this cycle
E0319 23:47:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:03.409782 543705 memory.go:184] no items to output this cycle
I0319 23:47:03.409820 543705 cpu.go:275] no items to output this cycle
E0319 23:47:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:13.409793 543705 memory.go:191] Add success.
I0319 23:47:13.409800 543705 cpu.go:282] Add success.
W0319 23:47:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:47:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:47:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:47:13.420089 543705 net.go:648] Add success.
I0319 23:47:13.423028 543705 net.go:770] primary dev: ETH0
I0319 23:47:13.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:47:13.423055 543705 net.go:698] Add success.
I0319 23:47:13.453638 543705 event_worker.go:152] Polling the log file for events...
W0319 23:47:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:47:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0319 23:47:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:47:14.456812 543705 disk_worker.go:494] system disk:vda1
I0319 23:47:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:47:14.457126 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:47:14.457134 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:47:14.457138 543705 custom_config.go:64] query custom config with name: gpu
E0319 23:47:15.456821 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:47:15.456828 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:47:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:47:16.457972 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:47:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:47:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:47:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:47:21.566464 543705 disk_info.go:125] begin check local disk info of client
I0319 23:47:21.568940 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:47:21.568947 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e800 0xc00037e840]
E0319 23:47:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:23.409756 543705 memory.go:184] no items to output this cycle
I0319 23:47:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:47:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:33.409781 543705 memory.go:184] no items to output this cycle
I0319 23:47:33.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:47:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:43.409821 543705 memory.go:191] Add success.
I0319 23:47:43.409838 543705 cpu.go:282] Add success.
I0319 23:47:43.420013 543705 net.go:648] Add success.
I0319 23:47:43.422763 543705 net.go:770] primary dev: ETH0
I0319 23:47:43.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:47:43.422794 543705 net.go:698] Add success.
I0319 23:47:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:47:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:47:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:47:53.409765 543705 memory.go:184] no items to output this cycle
I0319 23:47:53.409795 543705 cpu.go:275] no items to output this cycle
E0319 23:48:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:03.409804 543705 memory.go:184] no items to output this cycle
I0319 23:48:03.409814 543705 cpu.go:275] no items to output this cycle
E0319 23:48:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:13.409786 543705 memory.go:191] Add success.
W0319 23:48:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0319 23:48:13.409814 543705 cpu.go:282] Add success.
W0319 23:48:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:48:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:48:13.420052 543705 net.go:648] Add success.
I0319 23:48:13.423234 543705 net.go:770] primary dev: ETH0
I0319 23:48:13.423248 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:48:13.423260 543705 net.go:698] Add success.
I0319 23:48:13.463214 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5abb5a52-6be4-403e-8d39-ceffa92cb844","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:48:13.463243 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:48:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:48:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:48:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0319 23:48:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:48:14.456671 543705 disk_worker.go:494] system disk:vda1
I0319 23:48:14.456703 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:48:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:48:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:48:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:48:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:48:21.569600 543705 disk_info.go:125] begin check local disk info of client
I0319 23:48:21.571898 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:48:21.571905 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a1980 0xc0002a19c0]
E0319 23:48:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:23.409782 543705 memory.go:184] no items to output this cycle
I0319 23:48:23.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:48:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:33.409787 543705 memory.go:184] no items to output this cycle
I0319 23:48:33.409809 543705 cpu.go:275] no items to output this cycle
I0319 23:48:37.966026 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:48:37.966032 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:48:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:43.410705 543705 memory.go:191] Add success.
I0319 23:48:43.409827 543705 cpu.go:282] Add success.
I0319 23:48:43.420449 543705 net.go:648] Add success.
I0319 23:48:43.423244 543705 net.go:770] primary dev: ETH0
I0319 23:48:43.423260 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:48:43.423275 543705 net.go:698] Add success.
I0319 23:48:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:48:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:48:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:48:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:48:53.409766 543705 memory.go:184] no items to output this cycle
I0319 23:48:53.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:49:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:03.409816 543705 memory.go:184] no items to output this cycle
I0319 23:49:03.409838 543705 cpu.go:275] no items to output this cycle
E0319 23:49:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:13.409805 543705 memory.go:191] Add success.
I0319 23:49:13.409806 543705 cpu.go:282] Add success.
W0319 23:49:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:49:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:49:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:49:13.420274 543705 net.go:648] Add success.
I0319 23:49:13.422932 543705 net.go:770] primary dev: ETH0
I0319 23:49:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:49:13.422961 543705 net.go:698] Add success.
I0319 23:49:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:49:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:49:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0319 23:49:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:49:14.456583 543705 disk_worker.go:494] system disk:vda1
I0319 23:49:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:49:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:49:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:49:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:49:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:49:16.472431 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:49:21.572499 543705 disk_info.go:125] begin check local disk info of client
I0319 23:49:21.574931 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:49:21.574937 543705 disk_info.go:196] parse disk info done, disk is : [0xc000343c40 0xc000343c80]
E0319 23:49:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:23.409783 543705 memory.go:184] no items to output this cycle
I0319 23:49:23.409797 543705 cpu.go:275] no items to output this cycle
I0319 23:49:33.409927 543705 cpu.go:275] no items to output this cycle
E0319 23:49:33.409927 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:33.409975 543705 memory.go:184] no items to output this cycle
E0319 23:49:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:43.409808 543705 memory.go:191] Add success.
I0319 23:49:43.409807 543705 cpu.go:282] Add success.
I0319 23:49:43.419995 543705 net.go:648] Add success.
I0319 23:49:43.422699 543705 net.go:770] primary dev: ETH0
I0319 23:49:43.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:49:43.422726 543705 net.go:698] Add success.
I0319 23:49:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:49:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:49:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:49:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:49:53.409794 543705 memory.go:184] no items to output this cycle
I0319 23:49:53.409804 543705 cpu.go:275] no items to output this cycle
E0319 23:50:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:03.409779 543705 memory.go:184] no items to output this cycle
I0319 23:50:03.409781 543705 cpu.go:275] no items to output this cycle
E0319 23:50:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:13.409808 543705 memory.go:191] Add success.
I0319 23:50:13.409815 543705 cpu.go:282] Add success.
W0319 23:50:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:50:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:50:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:50:13.420109 543705 net.go:648] Add success.
I0319 23:50:13.422921 543705 net.go:770] primary dev: ETH0
I0319 23:50:13.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:50:13.422957 543705 net.go:698] Add success.
I0319 23:50:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:50:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:50:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0319 23:50:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:50:14.456604 543705 disk_worker.go:494] system disk:vda1
I0319 23:50:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:50:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:50:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:50:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:50:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:50:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:50:21.575617 543705 disk_info.go:125] begin check local disk info of client
I0319 23:50:21.578105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:50:21.578111 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256900 0xc000256940]
E0319 23:50:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:23.409774 543705 memory.go:184] no items to output this cycle
I0319 23:50:23.409779 543705 cpu.go:275] no items to output this cycle
E0319 23:50:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:33.409796 543705 memory.go:184] no items to output this cycle
I0319 23:50:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:50:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:43.409830 543705 memory.go:191] Add success.
I0319 23:50:43.409837 543705 cpu.go:282] Add success.
I0319 23:50:43.419816 543705 net.go:770] primary dev: ETH0
I0319 23:50:43.419830 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:50:43.419845 543705 net.go:698] Add success.
I0319 23:50:43.420224 543705 net.go:648] Add success.
I0319 23:50:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:50:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:50:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:50:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:50:53.409794 543705 memory.go:184] no items to output this cycle
I0319 23:50:53.409805 543705 cpu.go:275] no items to output this cycle
E0319 23:51:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:03.409788 543705 memory.go:184] no items to output this cycle
I0319 23:51:03.409828 543705 cpu.go:275] no items to output this cycle
E0319 23:51:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:13.409805 543705 memory.go:191] Add success.
I0319 23:51:13.409809 543705 cpu.go:282] Add success.
W0319 23:51:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:51:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:51:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:51:13.420213 543705 net.go:648] Add success.
I0319 23:51:13.422789 543705 net.go:770] primary dev: ETH0
I0319 23:51:13.422805 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:51:13.422818 543705 net.go:698] Add success.
I0319 23:51:13.469850 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"20f4d597-6ef6-4101-a2c4-8806639a6c96","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:51:13.469883 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:51:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:51:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:51:14.455233 543705 disk_worker.go:708] disk space is not compliant
W0319 23:51:14.455236 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:51:14.456763 543705 disk_worker.go:494] system disk:vda1
I0319 23:51:14.456795 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:51:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:51:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:51:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:51:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:51:16.472446 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:51:21.578532 543705 disk_info.go:125] begin check local disk info of client
I0319 23:51:21.581056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:51:21.581062 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256640 0xc000256680]
E0319 23:51:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:23.409768 543705 memory.go:184] no items to output this cycle
I0319 23:51:23.409803 543705 cpu.go:275] no items to output this cycle
E0319 23:51:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:33.409765 543705 memory.go:184] no items to output this cycle
I0319 23:51:33.409801 543705 cpu.go:275] no items to output this cycle
I0319 23:51:37.966835 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:51:37.966841 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:51:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:43.410623 543705 memory.go:191] Add success.
I0319 23:51:43.409817 543705 cpu.go:282] Add success.
I0319 23:51:43.420394 543705 net.go:648] Add success.
I0319 23:51:43.422978 543705 net.go:770] primary dev: ETH0
I0319 23:51:43.422990 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:51:43.423003 543705 net.go:698] Add success.
I0319 23:51:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:51:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:51:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:51:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:51:53.409779 543705 cpu.go:275] no items to output this cycle
I0319 23:51:53.409782 543705 memory.go:184] no items to output this cycle
E0319 23:52:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:03.409781 543705 memory.go:184] no items to output this cycle
I0319 23:52:03.409784 543705 cpu.go:275] no items to output this cycle
W0319 23:52:13.409707 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:52:13.409724 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:52:13.409728 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0319 23:52:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:13.409818 543705 memory.go:191] Add success.
I0319 23:52:13.409827 543705 cpu.go:282] Add success.
I0319 23:52:13.420052 543705 net.go:648] Add success.
I0319 23:52:13.423261 543705 net.go:770] primary dev: ETH0
I0319 23:52:13.423274 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:52:13.423288 543705 net.go:698] Add success.
W0319 23:52:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:52:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0319 23:52:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:52:14.456854 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:52:14.456863 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:52:14.456870 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:52:14.456942 543705 disk_worker.go:494] system disk:vda1
I0319 23:52:14.456985 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:52:15.456866 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:52:15.456876 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:52:16.457921 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:52:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:52:16.457975 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:52:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:52:16.472325 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:52:21.581597 543705 disk_info.go:125] begin check local disk info of client
I0319 23:52:21.583976 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:52:21.583982 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8680 0xc0004a86c0]
E0319 23:52:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:23.409788 543705 memory.go:184] no items to output this cycle
I0319 23:52:23.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:52:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:33.409792 543705 memory.go:184] no items to output this cycle
I0319 23:52:33.409805 543705 cpu.go:275] no items to output this cycle
E0319 23:52:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:43.409800 543705 cpu.go:282] Add success.
I0319 23:52:43.409805 543705 memory.go:191] Add success.
I0319 23:52:43.420066 543705 net.go:648] Add success.
I0319 23:52:43.422823 543705 net.go:770] primary dev: ETH0
I0319 23:52:43.422836 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:52:43.422848 543705 net.go:698] Add success.
I0319 23:52:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:52:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:52:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:52:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:52:53.409798 543705 memory.go:184] no items to output this cycle
I0319 23:52:53.409813 543705 cpu.go:275] no items to output this cycle
E0319 23:53:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:03.409775 543705 memory.go:184] no items to output this cycle
I0319 23:53:03.409855 543705 cpu.go:275] no items to output this cycle
E0319 23:53:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:13.409797 543705 memory.go:191] Add success.
I0319 23:53:13.409798 543705 cpu.go:282] Add success.
W0319 23:53:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:53:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:53:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:53:13.420149 543705 net.go:648] Add success.
I0319 23:53:13.422893 543705 net.go:770] primary dev: ETH0
I0319 23:53:13.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:53:13.422919 543705 net.go:698] Add success.
I0319 23:53:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:53:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:53:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0319 23:53:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:53:14.456543 543705 disk_worker.go:494] system disk:vda1
I0319 23:53:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:53:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:53:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:53:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:53:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:53:21.584064 543705 disk_info.go:125] begin check local disk info of client
I0319 23:53:21.586566 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:53:21.586573 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384b40 0xc000384b80]
E0319 23:53:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:23.409778 543705 memory.go:184] no items to output this cycle
I0319 23:53:23.409807 543705 cpu.go:275] no items to output this cycle
E0319 23:53:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:33.409772 543705 memory.go:184] no items to output this cycle
I0319 23:53:33.409796 543705 cpu.go:275] no items to output this cycle
E0319 23:53:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:43.409819 543705 memory.go:191] Add success.
I0319 23:53:43.409828 543705 cpu.go:282] Add success.
I0319 23:53:43.420140 543705 net.go:648] Add success.
I0319 23:53:43.423134 543705 net.go:770] primary dev: ETH0
I0319 23:53:43.423150 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:53:43.423163 543705 net.go:698] Add success.
I0319 23:53:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:53:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:53:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:53:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:53:53.409778 543705 memory.go:184] no items to output this cycle
I0319 23:53:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 23:54:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:03.409784 543705 memory.go:184] no items to output this cycle
I0319 23:54:03.409793 543705 cpu.go:275] no items to output this cycle
E0319 23:54:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:13.409798 543705 memory.go:191] Add success.
I0319 23:54:13.409799 543705 cpu.go:282] Add success.
W0319 23:54:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:54:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:54:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:54:13.420121 543705 net.go:648] Add success.
I0319 23:54:13.422961 543705 net.go:770] primary dev: ETH0
I0319 23:54:13.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:54:13.422987 543705 net.go:698] Add success.
I0319 23:54:13.469413 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6a75e560-5edf-44a9-b042-f827bdbde01c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:54:13.469446 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0319 23:54:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:54:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:54:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0319 23:54:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:54:14.456633 543705 disk_worker.go:494] system disk:vda1
I0319 23:54:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:54:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:54:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:54:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:54:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:54:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:54:21.586626 543705 disk_info.go:125] begin check local disk info of client
I0319 23:54:21.589058 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:54:21.589064 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b4c00 0xc0004b4c40]
E0319 23:54:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:23.409786 543705 memory.go:184] no items to output this cycle
I0319 23:54:23.409797 543705 cpu.go:275] no items to output this cycle
E0319 23:54:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:33.409768 543705 memory.go:184] no items to output this cycle
I0319 23:54:33.409788 543705 cpu.go:275] no items to output this cycle
I0319 23:54:37.966984 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:54:37.966990 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:54:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:43.410540 543705 memory.go:191] Add success.
I0319 23:54:43.409803 543705 cpu.go:282] Add success.
I0319 23:54:43.420587 543705 net.go:648] Add success.
I0319 23:54:43.423188 543705 net.go:770] primary dev: ETH0
I0319 23:54:43.423201 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:54:43.423213 543705 net.go:698] Add success.
I0319 23:54:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:54:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:54:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:54:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:54:53.409760 543705 memory.go:184] no items to output this cycle
I0319 23:54:53.409798 543705 cpu.go:275] no items to output this cycle
I0319 23:55:03.409809 543705 cpu.go:275] no items to output this cycle
E0319 23:55:03.409809 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:03.409831 543705 memory.go:184] no items to output this cycle
E0319 23:55:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:13.409827 543705 memory.go:191] Add success.
I0319 23:55:13.409827 543705 cpu.go:282] Add success.
W0319 23:55:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:55:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:55:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:55:13.420229 543705 net.go:648] Add success.
I0319 23:55:13.422841 543705 net.go:770] primary dev: ETH0
I0319 23:55:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:55:13.422866 543705 net.go:698] Add success.
I0319 23:55:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:55:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:55:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0319 23:55:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:55:14.456501 543705 disk_worker.go:494] system disk:vda1
I0319 23:55:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:55:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:55:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:55:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:55:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:55:21.589589 543705 disk_info.go:125] begin check local disk info of client
I0319 23:55:21.592155 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:55:21.592161 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b5000 0xc0004b5040]
E0319 23:55:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:23.409767 543705 memory.go:184] no items to output this cycle
I0319 23:55:23.409778 543705 cpu.go:275] no items to output this cycle
E0319 23:55:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:33.409795 543705 memory.go:184] no items to output this cycle
I0319 23:55:33.409810 543705 cpu.go:275] no items to output this cycle
E0319 23:55:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:43.409830 543705 memory.go:191] Add success.
I0319 23:55:43.409833 543705 cpu.go:282] Add success.
I0319 23:55:43.420144 543705 net.go:648] Add success.
I0319 23:55:43.422843 543705 net.go:770] primary dev: ETH0
I0319 23:55:43.422856 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:55:43.422868 543705 net.go:698] Add success.
I0319 23:55:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:55:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:55:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:55:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:55:53.409782 543705 memory.go:184] no items to output this cycle
I0319 23:55:53.409783 543705 cpu.go:275] no items to output this cycle
E0319 23:56:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:03.409783 543705 memory.go:184] no items to output this cycle
I0319 23:56:03.409787 543705 cpu.go:275] no items to output this cycle
E0319 23:56:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:13.409793 543705 memory.go:191] Add success.
I0319 23:56:13.409796 543705 cpu.go:282] Add success.
W0319 23:56:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:56:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:56:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:56:13.420265 543705 net.go:648] Add success.
I0319 23:56:13.422877 543705 net.go:770] primary dev: ETH0
I0319 23:56:13.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:56:13.422910 543705 net.go:698] Add success.
I0319 23:56:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:56:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:56:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0319 23:56:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:56:14.456571 543705 disk_worker.go:494] system disk:vda1
I0319 23:56:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:56:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:56:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:56:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:56:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:56:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:56:21.592603 543705 disk_info.go:125] begin check local disk info of client
I0319 23:56:21.595036 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:56:21.595042 543705 disk_info.go:196] parse disk info done, disk is : [0xc000580d40 0xc000580d80]
E0319 23:56:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:23.409787 543705 memory.go:184] no items to output this cycle
I0319 23:56:23.409798 543705 cpu.go:275] no items to output this cycle
E0319 23:56:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:33.409767 543705 memory.go:184] no items to output this cycle
I0319 23:56:33.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:56:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:43.409780 543705 memory.go:191] Add success.
I0319 23:56:43.409817 543705 cpu.go:282] Add success.
I0319 23:56:43.419963 543705 net.go:648] Add success.
I0319 23:56:43.422430 543705 net.go:770] primary dev: ETH0
I0319 23:56:43.422446 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:56:43.422461 543705 net.go:698] Add success.
I0319 23:56:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:56:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:56:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:56:53.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:56:53.409858 543705 memory.go:184] no items to output this cycle
I0319 23:56:53.409961 543705 cpu.go:275] no items to output this cycle
E0319 23:57:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:03.409801 543705 memory.go:184] no items to output this cycle
I0319 23:57:03.409823 543705 cpu.go:275] no items to output this cycle
E0319 23:57:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:13.409818 543705 memory.go:191] Add success.
I0319 23:57:13.409826 543705 cpu.go:282] Add success.
W0319 23:57:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:57:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:57:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:57:13.420152 543705 net.go:648] Add success.
I0319 23:57:13.422819 543705 net.go:770] primary dev: ETH0
I0319 23:57:13.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:57:13.422847 543705 net.go:698] Add success.
I0319 23:57:13.429111 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0319 23:57:13.453284 543705 event_worker.go:152] Polling the log file for events...
I0319 23:57:13.485844 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b1019dc0-d3f0-413a-ae08-1c4fc92ed21a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0319 23:57:13.485876 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0319 23:57:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:57:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0319 23:57:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0319 23:57:14.455858 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0319 23:57:14.455867 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0319 23:57:14.455871 543705 custom_config.go:64] query custom config with name: gpu
I0319 23:57:14.456621 543705 disk_worker.go:494] system disk:vda1
I0319 23:57:14.456667 543705 disk_worker.go:432] add disk info successfully, len:33
E0319 23:57:15.456849 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0319 23:57:15.456857 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:57:16.457941 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0319 23:57:16.457950 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0319 23:57:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:57:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:57:16.472328 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:57:21.595676 543705 disk_info.go:125] begin check local disk info of client
I0319 23:57:21.598114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:57:21.598120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003844c0 0xc000384500]
E0319 23:57:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:23.409765 543705 memory.go:184] no items to output this cycle
I0319 23:57:23.409792 543705 cpu.go:275] no items to output this cycle
E0319 23:57:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:33.409764 543705 memory.go:184] no items to output this cycle
I0319 23:57:33.409804 543705 cpu.go:275] no items to output this cycle
I0319 23:57:37.969542 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0319 23:57:37.969548 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0319 23:57:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:43.410709 543705 memory.go:191] Add success.
I0319 23:57:43.409813 543705 cpu.go:282] Add success.
I0319 23:57:43.420498 543705 net.go:648] Add success.
I0319 23:57:43.423608 543705 net.go:770] primary dev: ETH0
I0319 23:57:43.423635 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:57:43.423647 543705 net.go:698] Add success.
I0319 23:57:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:57:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:57:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:57:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:57:53.409782 543705 memory.go:184] no items to output this cycle
I0319 23:57:53.409781 543705 cpu.go:275] no items to output this cycle
E0319 23:58:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:03.409789 543705 memory.go:184] no items to output this cycle
I0319 23:58:03.409802 543705 cpu.go:275] no items to output this cycle
E0319 23:58:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:13.409793 543705 memory.go:191] Add success.
I0319 23:58:13.409810 543705 cpu.go:282] Add success.
W0319 23:58:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:58:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:58:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:58:13.420089 543705 net.go:648] Add success.
I0319 23:58:13.422857 543705 net.go:770] primary dev: ETH0
I0319 23:58:13.422872 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:58:13.422886 543705 net.go:698] Add success.
I0319 23:58:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:58:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:58:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0319 23:58:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:58:14.456576 543705 disk_worker.go:494] system disk:vda1
I0319 23:58:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:58:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:58:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:58:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:58:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:58:21.598689 543705 disk_info.go:125] begin check local disk info of client
I0319 23:58:21.601134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:58:21.601141 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035f780 0xc00035f7c0]
E0319 23:58:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:23.409785 543705 cpu.go:275] no items to output this cycle
I0319 23:58:23.409794 543705 memory.go:184] no items to output this cycle
E0319 23:58:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:33.409772 543705 memory.go:184] no items to output this cycle
I0319 23:58:33.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:58:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:43.409803 543705 memory.go:191] Add success.
I0319 23:58:43.409811 543705 cpu.go:282] Add success.
I0319 23:58:43.420135 543705 net.go:648] Add success.
I0319 23:58:43.422845 543705 net.go:770] primary dev: ETH0
I0319 23:58:43.422861 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:58:43.422874 543705 net.go:698] Add success.
I0319 23:58:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:58:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:58:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:58:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:58:53.409788 543705 memory.go:184] no items to output this cycle
I0319 23:58:53.409808 543705 cpu.go:275] no items to output this cycle
E0319 23:59:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:03.409816 543705 memory.go:184] no items to output this cycle
I0319 23:59:03.409826 543705 cpu.go:275] no items to output this cycle
E0319 23:59:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:13.409793 543705 memory.go:191] Add success.
I0319 23:59:13.409815 543705 cpu.go:282] Add success.
W0319 23:59:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0319 23:59:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0319 23:59:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0319 23:59:13.420054 543705 net.go:648] Add success.
I0319 23:59:13.422820 543705 net.go:770] primary dev: ETH0
I0319 23:59:13.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:59:13.422845 543705 net.go:698] Add success.
I0319 23:59:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0319 23:59:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0319 23:59:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0319 23:59:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0319 23:59:14.456563 543705 disk_worker.go:494] system disk:vda1
I0319 23:59:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0319 23:59:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0319 23:59:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:59:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:59:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0319 23:59:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0319 23:59:21.601673 543705 disk_info.go:125] begin check local disk info of client
I0319 23:59:21.604129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0319 23:59:21.604135 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab440 0xc0001ab480]
E0319 23:59:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:23.409799 543705 memory.go:184] no items to output this cycle
I0319 23:59:23.409813 543705 cpu.go:275] no items to output this cycle
E0319 23:59:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:33.409769 543705 memory.go:184] no items to output this cycle
I0319 23:59:33.409800 543705 cpu.go:275] no items to output this cycle
E0319 23:59:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:43.409792 543705 memory.go:191] Add success.
I0319 23:59:43.409820 543705 cpu.go:282] Add success.
I0319 23:59:43.419730 543705 net.go:648] Add success.
I0319 23:59:43.422504 543705 net.go:770] primary dev: ETH0
I0319 23:59:43.422518 543705 net.go:802] Send network stats successfully!,count is 6
I0319 23:59:43.422530 543705 net.go:698] Add success.
I0319 23:59:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0319 23:59:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0319 23:59:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0319 23:59:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0319 23:59:53.409760 543705 memory.go:184] no items to output this cycle
I0319 23:59:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:00:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:03.409775 543705 memory.go:184] no items to output this cycle
I0320 00:00:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:00:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:13.409785 543705 memory.go:191] Add success.
I0320 00:00:13.409807 543705 cpu.go:282] Add success.
W0320 00:00:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:00:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:00:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:00:13.420212 543705 net.go:648] Add success.
I0320 00:00:13.422964 543705 net.go:770] primary dev: ETH0
I0320 00:00:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:00:13.422988 543705 net.go:698] Add success.
I0320 00:00:13.497405 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"92b7482e-b6d5-4e00-a534-7bd74da40b41","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:00:13.497440 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:00:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:00:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:00:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 00:00:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:00:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 00:00:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:00:15.455615 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:00:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:00:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:00:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:00:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:00:21.604719 543705 disk_info.go:125] begin check local disk info of client
I0320 00:00:21.607195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:00:21.607201 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4fc0 0xc0000c5000]
E0320 00:00:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:23.409770 543705 memory.go:184] no items to output this cycle
I0320 00:00:23.409773 543705 cpu.go:275] no items to output this cycle
E0320 00:00:33.409844 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:33.409870 543705 memory.go:184] no items to output this cycle
I0320 00:00:33.409954 543705 cpu.go:275] no items to output this cycle
I0320 00:00:37.969729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:00:37.969735 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:00:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:43.410641 543705 memory.go:191] Add success.
I0320 00:00:43.409817 543705 cpu.go:282] Add success.
I0320 00:00:43.420349 543705 net.go:648] Add success.
I0320 00:00:43.423175 543705 net.go:770] primary dev: ETH0
I0320 00:00:43.423191 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:00:43.423206 543705 net.go:698] Add success.
I0320 00:00:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:00:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:00:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:00:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:00:53.409791 543705 memory.go:184] no items to output this cycle
I0320 00:00:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 00:01:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:03.409776 543705 memory.go:184] no items to output this cycle
I0320 00:01:03.409795 543705 cpu.go:275] no items to output this cycle
W0320 00:01:13.409709 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:01:13.409732 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:01:13.409738 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 00:01:13.409830 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:13.409836 543705 cpu.go:282] Add success.
I0320 00:01:13.409851 543705 memory.go:191] Add success.
I0320 00:01:13.420178 543705 net.go:648] Add success.
I0320 00:01:13.422702 543705 net.go:770] primary dev: ETH0
I0320 00:01:13.422715 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:01:13.422727 543705 net.go:698] Add success.
I0320 00:01:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:01:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:01:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 00:01:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:01:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 00:01:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:01:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:01:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:01:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:01:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:01:21.607284 543705 disk_info.go:125] begin check local disk info of client
I0320 00:01:21.609778 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:01:21.609785 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003297c0 0xc000329800]
I0320 00:01:23.409849 543705 cpu.go:275] no items to output this cycle
E0320 00:01:23.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:23.409871 543705 memory.go:184] no items to output this cycle
E0320 00:01:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:33.409795 543705 memory.go:184] no items to output this cycle
I0320 00:01:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 00:01:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:43.409788 543705 memory.go:191] Add success.
I0320 00:01:43.409823 543705 cpu.go:282] Add success.
I0320 00:01:43.419894 543705 net.go:648] Add success.
I0320 00:01:43.422712 543705 net.go:770] primary dev: ETH0
I0320 00:01:43.422725 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:01:43.422752 543705 net.go:698] Add success.
I0320 00:01:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:01:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:01:46.458093 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:01:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:01:53.409803 543705 memory.go:184] no items to output this cycle
I0320 00:01:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 00:02:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:03.409805 543705 memory.go:184] no items to output this cycle
I0320 00:02:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 00:02:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:13.409785 543705 memory.go:191] Add success.
I0320 00:02:13.409803 543705 cpu.go:282] Add success.
W0320 00:02:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:02:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:02:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:02:13.420122 543705 net.go:648] Add success.
I0320 00:02:13.423053 543705 net.go:770] primary dev: ETH0
I0320 00:02:13.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:02:13.423082 543705 net.go:698] Add success.
W0320 00:02:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:02:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 00:02:14.455195 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:02:14.456927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:02:14.456936 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:02:14.456942 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:02:14.457015 543705 disk_worker.go:494] system disk:vda1
I0320 00:02:14.457058 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:02:15.456778 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:02:15.456786 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:02:16.457961 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:02:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:02:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:02:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:02:16.472363 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:02:21.610728 543705 disk_info.go:125] begin check local disk info of client
I0320 00:02:21.613226 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:02:21.613231 543705 disk_info.go:196] parse disk info done, disk is : [0xc000293140 0xc000293180]
E0320 00:02:23.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:23.409876 543705 memory.go:184] no items to output this cycle
I0320 00:02:23.409946 543705 cpu.go:275] no items to output this cycle
E0320 00:02:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:33.409778 543705 memory.go:184] no items to output this cycle
I0320 00:02:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:02:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:43.409807 543705 cpu.go:282] Add success.
I0320 00:02:43.409810 543705 memory.go:191] Add success.
I0320 00:02:43.419995 543705 net.go:648] Add success.
I0320 00:02:43.422753 543705 net.go:770] primary dev: ETH0
I0320 00:02:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:02:43.422782 543705 net.go:698] Add success.
I0320 00:02:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:02:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:02:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:02:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:02:53.409783 543705 memory.go:184] no items to output this cycle
I0320 00:02:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:03:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:03.409768 543705 memory.go:184] no items to output this cycle
I0320 00:03:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:03:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:13.409791 543705 memory.go:191] Add success.
W0320 00:03:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:03:13.409816 543705 cpu.go:282] Add success.
W0320 00:03:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:03:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:03:13.420139 543705 net.go:648] Add success.
I0320 00:03:13.423096 543705 net.go:770] primary dev: ETH0
I0320 00:03:13.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:03:13.423121 543705 net.go:698] Add success.
I0320 00:03:13.927891 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"70438be1-72be-436f-96a1-520a826c44e0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:03:13.927926 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:03:14.454584 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:03:14.454742 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:03:14.454807 543705 disk_worker.go:708] disk space is not compliant
W0320 00:03:14.454810 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:03:14.456181 543705 disk_worker.go:494] system disk:vda1
I0320 00:03:14.456236 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:03:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:03:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:03:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:03:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:03:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:03:21.613671 543705 disk_info.go:125] begin check local disk info of client
I0320 00:03:21.616171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:03:21.616177 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370100 0xc000370140]
E0320 00:03:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:23.409781 543705 memory.go:184] no items to output this cycle
I0320 00:03:23.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:03:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:33.409808 543705 memory.go:184] no items to output this cycle
I0320 00:03:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 00:03:37.972564 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:03:37.972570 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:03:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:43.410561 543705 memory.go:191] Add success.
I0320 00:03:43.409814 543705 cpu.go:282] Add success.
I0320 00:03:43.420262 543705 net.go:648] Add success.
I0320 00:03:43.422750 543705 net.go:770] primary dev: ETH0
I0320 00:03:43.422764 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:03:43.422779 543705 net.go:698] Add success.
I0320 00:03:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:03:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:03:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:03:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:03:53.409777 543705 memory.go:184] no items to output this cycle
I0320 00:03:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 00:04:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:03.409774 543705 memory.go:184] no items to output this cycle
I0320 00:04:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:04:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:13.409815 543705 memory.go:191] Add success.
I0320 00:04:13.409828 543705 cpu.go:282] Add success.
W0320 00:04:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:04:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:04:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:04:13.420153 543705 net.go:648] Add success.
I0320 00:04:13.422860 543705 net.go:770] primary dev: ETH0
I0320 00:04:13.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:04:13.422891 543705 net.go:698] Add success.
I0320 00:04:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:04:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:04:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 00:04:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:04:14.456621 543705 disk_worker.go:494] system disk:vda1
I0320 00:04:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:04:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:04:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:04:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:04:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:04:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:04:21.616769 543705 disk_info.go:125] begin check local disk info of client
I0320 00:04:21.619199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:04:21.619205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b6300 0xc0002b6340]
E0320 00:04:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:23.409784 543705 memory.go:184] no items to output this cycle
I0320 00:04:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:04:33.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:33.409896 543705 memory.go:184] no items to output this cycle
I0320 00:04:33.409897 543705 cpu.go:275] no items to output this cycle
E0320 00:04:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:43.409792 543705 memory.go:191] Add success.
I0320 00:04:43.409809 543705 cpu.go:282] Add success.
I0320 00:04:43.420061 543705 net.go:648] Add success.
I0320 00:04:43.422919 543705 net.go:770] primary dev: ETH0
I0320 00:04:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:04:43.422950 543705 net.go:698] Add success.
I0320 00:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:04:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:04:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:04:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:04:53.409779 543705 memory.go:184] no items to output this cycle
I0320 00:04:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:05:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:03.409798 543705 memory.go:184] no items to output this cycle
I0320 00:05:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 00:05:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:13.409798 543705 memory.go:191] Add success.
I0320 00:05:13.409799 543705 cpu.go:282] Add success.
W0320 00:05:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:05:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:05:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:05:13.420144 543705 net.go:648] Add success.
I0320 00:05:13.423049 543705 net.go:770] primary dev: ETH0
I0320 00:05:13.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:05:13.423072 543705 net.go:698] Add success.
I0320 00:05:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:05:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:05:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 00:05:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:05:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 00:05:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:05:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:05:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:05:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:05:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:05:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:05:21.619782 543705 disk_info.go:125] begin check local disk info of client
I0320 00:05:21.622215 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:05:21.622222 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5e80 0xc0000c5ec0]
E0320 00:05:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:23.409787 543705 memory.go:184] no items to output this cycle
I0320 00:05:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:05:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:33.409792 543705 memory.go:184] no items to output this cycle
I0320 00:05:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 00:05:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:43.409797 543705 cpu.go:282] Add success.
I0320 00:05:43.409808 543705 memory.go:191] Add success.
I0320 00:05:43.419724 543705 net.go:648] Add success.
I0320 00:05:43.422556 543705 net.go:770] primary dev: ETH0
I0320 00:05:43.422570 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:05:43.422585 543705 net.go:698] Add success.
I0320 00:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:05:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:05:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:05:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:05:53.409805 543705 memory.go:184] no items to output this cycle
I0320 00:05:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:06:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:03.409774 543705 memory.go:184] no items to output this cycle
I0320 00:06:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 00:06:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:13.409812 543705 memory.go:191] Add success.
I0320 00:06:13.409821 543705 cpu.go:282] Add success.
W0320 00:06:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:06:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:06:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:06:13.420124 543705 net.go:648] Add success.
I0320 00:06:13.423135 543705 net.go:770] primary dev: ETH0
I0320 00:06:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:06:13.423161 543705 net.go:698] Add success.
I0320 00:06:13.476819 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8f424be9-cb20-4a37-aa2e-a855656d4ab7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:06:13.476853 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:06:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:06:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:06:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 00:06:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:06:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 00:06:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:06:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:06:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:06:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:06:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:06:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:06:21.622753 543705 disk_info.go:125] begin check local disk info of client
I0320 00:06:21.625158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:06:21.625164 543705 disk_info.go:196] parse disk info done, disk is : [0xc000366140 0xc000366180]
E0320 00:06:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:23.409792 543705 memory.go:184] no items to output this cycle
I0320 00:06:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:06:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:33.409766 543705 memory.go:184] no items to output this cycle
I0320 00:06:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 00:06:37.973738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:06:37.973745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:06:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:43.410754 543705 memory.go:191] Add success.
I0320 00:06:43.409797 543705 cpu.go:282] Add success.
I0320 00:06:43.419712 543705 net.go:648] Add success.
I0320 00:06:43.422582 543705 net.go:770] primary dev: ETH0
I0320 00:06:43.422595 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:06:43.422607 543705 net.go:698] Add success.
I0320 00:06:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:06:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:06:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:06:53.410221 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:06:53.410241 543705 memory.go:184] no items to output this cycle
I0320 00:06:53.410250 543705 cpu.go:275] no items to output this cycle
E0320 00:07:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:03.409769 543705 memory.go:184] no items to output this cycle
I0320 00:07:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 00:07:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:13.409780 543705 memory.go:191] Add success.
W0320 00:07:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:07:13.409810 543705 cpu.go:282] Add success.
W0320 00:07:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:07:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:07:13.420043 543705 net.go:648] Add success.
I0320 00:07:13.422707 543705 net.go:770] primary dev: ETH0
I0320 00:07:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:07:13.422731 543705 net.go:698] Add success.
I0320 00:07:13.453302 543705 event_worker.go:152] Polling the log file for events...
W0320 00:07:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:07:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 00:07:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:07:14.456950 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:07:14.456960 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:07:14.456966 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:07:14.457011 543705 disk_worker.go:494] system disk:vda1
I0320 00:07:14.457037 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:07:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:07:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:07:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:07:16.457921 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:07:16.457966 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:07:16.457982 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:07:16.472321 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:07:21.625671 543705 disk_info.go:125] begin check local disk info of client
I0320 00:07:21.628124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:07:21.628130 543705 disk_info.go:196] parse disk info done, disk is : [0xc000349000 0xc000349040]
E0320 00:07:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:23.409789 543705 memory.go:184] no items to output this cycle
I0320 00:07:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:07:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:33.409804 543705 memory.go:184] no items to output this cycle
I0320 00:07:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 00:07:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:43.409786 543705 memory.go:191] Add success.
I0320 00:07:43.409814 543705 cpu.go:282] Add success.
I0320 00:07:43.419995 543705 net.go:648] Add success.
I0320 00:07:43.423049 543705 net.go:770] primary dev: ETH0
I0320 00:07:43.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:07:43.423076 543705 net.go:698] Add success.
I0320 00:07:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:07:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:07:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:07:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:07:53.409767 543705 memory.go:184] no items to output this cycle
I0320 00:07:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:08:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:03.409779 543705 memory.go:184] no items to output this cycle
I0320 00:08:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:08:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:13.409775 543705 memory.go:191] Add success.
W0320 00:08:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:08:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:08:13.409811 543705 cpu.go:282] Add success.
I0320 00:08:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:08:13.420233 543705 net.go:648] Add success.
I0320 00:08:13.423119 543705 net.go:770] primary dev: ETH0
I0320 00:08:13.423136 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:08:13.423149 543705 net.go:698] Add success.
I0320 00:08:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:08:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:08:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 00:08:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:08:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 00:08:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:08:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:08:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:08:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:08:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:08:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:08:21.628839 543705 disk_info.go:125] begin check local disk info of client
I0320 00:08:21.631319 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:08:21.631325 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348800 0xc000348840]
E0320 00:08:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:23.409780 543705 memory.go:184] no items to output this cycle
I0320 00:08:23.409781 543705 cpu.go:275] no items to output this cycle
E0320 00:08:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:33.409776 543705 memory.go:184] no items to output this cycle
I0320 00:08:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 00:08:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:43.409784 543705 memory.go:191] Add success.
I0320 00:08:43.409817 543705 cpu.go:282] Add success.
I0320 00:08:43.419881 543705 net.go:648] Add success.
I0320 00:08:43.422471 543705 net.go:770] primary dev: ETH0
I0320 00:08:43.422486 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:08:43.422502 543705 net.go:698] Add success.
I0320 00:08:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:08:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:08:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:08:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:08:53.409769 543705 memory.go:184] no items to output this cycle
I0320 00:08:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:09:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:03.409783 543705 memory.go:184] no items to output this cycle
I0320 00:09:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:09:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:13.409802 543705 memory.go:191] Add success.
I0320 00:09:13.409805 543705 cpu.go:282] Add success.
W0320 00:09:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:09:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:09:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:09:13.420169 543705 net.go:648] Add success.
I0320 00:09:13.423080 543705 net.go:770] primary dev: ETH0
I0320 00:09:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:09:13.423108 543705 net.go:698] Add success.
I0320 00:09:13.464884 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8517e4ff-335a-4555-b8a9-8127ca5f6e66","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:09:13.464927 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:09:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:09:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:09:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 00:09:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:09:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 00:09:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:09:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:09:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:09:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:09:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:09:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:09:21.631790 543705 disk_info.go:125] begin check local disk info of client
I0320 00:09:21.634215 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:09:21.634220 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003491c0 0xc000349200]
E0320 00:09:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:23.409787 543705 memory.go:184] no items to output this cycle
I0320 00:09:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:09:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:33.409779 543705 memory.go:184] no items to output this cycle
I0320 00:09:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 00:09:37.976569 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:09:37.976576 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:09:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:43.410656 543705 memory.go:191] Add success.
I0320 00:09:43.409818 543705 cpu.go:282] Add success.
I0320 00:09:43.420337 543705 net.go:648] Add success.
I0320 00:09:43.422974 543705 net.go:770] primary dev: ETH0
I0320 00:09:43.422990 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:09:43.423007 543705 net.go:698] Add success.
I0320 00:09:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:09:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:09:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:09:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:09:53.409795 543705 memory.go:184] no items to output this cycle
I0320 00:09:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 00:10:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:03.409768 543705 memory.go:184] no items to output this cycle
I0320 00:10:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:10:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:13.409792 543705 memory.go:191] Add success.
W0320 00:10:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:10:13.409820 543705 cpu.go:282] Add success.
W0320 00:10:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:10:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:10:13.420247 543705 net.go:648] Add success.
I0320 00:10:13.422991 543705 net.go:770] primary dev: ETH0
I0320 00:10:13.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:10:13.423020 543705 net.go:698] Add success.
I0320 00:10:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:10:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:10:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 00:10:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:10:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 00:10:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:10:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:10:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:10:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:10:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:10:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:10:21.634305 543705 disk_info.go:125] begin check local disk info of client
I0320 00:10:21.636760 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:10:21.636766 543705 disk_info.go:196] parse disk info done, disk is : [0xc000559480 0xc0005594c0]
E0320 00:10:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:23.409761 543705 memory.go:184] no items to output this cycle
I0320 00:10:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:10:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:33.409790 543705 memory.go:184] no items to output this cycle
I0320 00:10:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:10:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:43.409812 543705 memory.go:191] Add success.
I0320 00:10:43.409815 543705 cpu.go:282] Add success.
I0320 00:10:43.419972 543705 net.go:648] Add success.
I0320 00:10:43.422993 543705 net.go:770] primary dev: ETH0
I0320 00:10:43.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:10:43.423022 543705 net.go:698] Add success.
I0320 00:10:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:10:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:10:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:10:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:10:53.409782 543705 memory.go:184] no items to output this cycle
I0320 00:10:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:11:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:03.409806 543705 memory.go:184] no items to output this cycle
I0320 00:11:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 00:11:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:13.409805 543705 memory.go:191] Add success.
W0320 00:11:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:11:13.409832 543705 cpu.go:282] Add success.
W0320 00:11:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:11:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:11:13.420143 543705 net.go:648] Add success.
I0320 00:11:13.423094 543705 net.go:770] primary dev: ETH0
I0320 00:11:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:11:13.423122 543705 net.go:698] Add success.
I0320 00:11:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:11:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:11:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 00:11:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:11:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 00:11:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:11:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:11:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:11:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:11:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:11:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:11:21.636871 543705 disk_info.go:125] begin check local disk info of client
I0320 00:11:21.639350 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:11:21.639356 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa480 0xc0001aa4c0]
E0320 00:11:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:23.409795 543705 memory.go:184] no items to output this cycle
I0320 00:11:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 00:11:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:33.409789 543705 memory.go:184] no items to output this cycle
I0320 00:11:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:11:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:43.409829 543705 memory.go:191] Add success.
I0320 00:11:43.409835 543705 cpu.go:282] Add success.
I0320 00:11:43.419998 543705 net.go:648] Add success.
I0320 00:11:43.422648 543705 net.go:770] primary dev: ETH0
I0320 00:11:43.422663 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:11:43.422678 543705 net.go:698] Add success.
I0320 00:11:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:11:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:11:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:11:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:11:53.409809 543705 memory.go:184] no items to output this cycle
I0320 00:11:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 00:12:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:03.409763 543705 memory.go:184] no items to output this cycle
I0320 00:12:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 00:12:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:13.409822 543705 memory.go:191] Add success.
I0320 00:12:13.409828 543705 cpu.go:282] Add success.
W0320 00:12:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:12:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:12:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:12:13.420148 543705 net.go:648] Add success.
I0320 00:12:13.423087 543705 net.go:770] primary dev: ETH0
I0320 00:12:13.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:12:13.423112 543705 net.go:698] Add success.
I0320 00:12:13.470178 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7e0d99ad-f583-4a00-9216-009e1d9623be","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:12:13.470216 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 00:12:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:12:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 00:12:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:12:14.455933 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:12:14.455942 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:12:14.455947 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:12:14.456572 543705 disk_worker.go:494] system disk:vda1
I0320 00:12:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:12:15.456868 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:12:15.456877 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:12:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:12:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:12:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:12:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:12:16.472342 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:12:21.639839 543705 disk_info.go:125] begin check local disk info of client
I0320 00:12:21.642307 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:12:21.642314 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e8c80 0xc0004e8cc0]
E0320 00:12:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:23.409754 543705 memory.go:184] no items to output this cycle
I0320 00:12:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:12:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:33.409782 543705 memory.go:184] no items to output this cycle
I0320 00:12:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 00:12:37.977756 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:12:37.977762 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:12:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:43.410729 543705 memory.go:191] Add success.
I0320 00:12:43.409827 543705 cpu.go:282] Add success.
I0320 00:12:43.420519 543705 net.go:648] Add success.
I0320 00:12:43.423308 543705 net.go:770] primary dev: ETH0
I0320 00:12:43.423321 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:12:43.423336 543705 net.go:698] Add success.
I0320 00:12:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:12:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:12:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:12:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:12:53.409799 543705 memory.go:184] no items to output this cycle
I0320 00:12:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:13:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:03.409891 543705 memory.go:184] no items to output this cycle
I0320 00:13:03.409977 543705 cpu.go:275] no items to output this cycle
W0320 00:13:13.409715 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:13:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:13:13.409739 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:13:13.409810 543705 cpu.go:282] Add success.
E0320 00:13:13.409836 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:13.409860 543705 memory.go:191] Add success.
I0320 00:13:13.420059 543705 net.go:648] Add success.
I0320 00:13:13.422769 543705 net.go:770] primary dev: ETH0
I0320 00:13:13.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:13:13.422794 543705 net.go:698] Add success.
I0320 00:13:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:13:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:13:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 00:13:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:13:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 00:13:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:13:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:13:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:13:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:13:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:13:21.642911 543705 disk_info.go:125] begin check local disk info of client
I0320 00:13:21.645432 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:13:21.645439 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003844c0 0xc000384500]
E0320 00:13:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 00:13:23.409787 543705 memory.go:184] no items to output this cycle
E0320 00:13:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:33.409801 543705 memory.go:184] no items to output this cycle
I0320 00:13:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:13:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:43.409800 543705 cpu.go:282] Add success.
I0320 00:13:43.409810 543705 memory.go:191] Add success.
I0320 00:13:43.420142 543705 net.go:648] Add success.
I0320 00:13:43.422754 543705 net.go:770] primary dev: ETH0
I0320 00:13:43.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:13:43.422781 543705 net.go:698] Add success.
I0320 00:13:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:13:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:13:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:13:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:13:53.409799 543705 memory.go:184] no items to output this cycle
I0320 00:13:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:14:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:03.409782 543705 memory.go:184] no items to output this cycle
I0320 00:14:03.409787 543705 cpu.go:275] no items to output this cycle
W0320 00:14:13.409719 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:14:13.409743 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:14:13.409749 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:14:13.409840 543705 cpu.go:282] Add success.
E0320 00:14:13.409843 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:13.409860 543705 memory.go:191] Add success.
I0320 00:14:13.420154 543705 net.go:648] Add success.
I0320 00:14:13.423004 543705 net.go:770] primary dev: ETH0
I0320 00:14:13.423030 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:14:13.423043 543705 net.go:698] Add success.
I0320 00:14:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:14:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:14:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 00:14:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:14:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 00:14:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:14:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:14:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:14:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:14:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:14:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:14:21.645671 543705 disk_info.go:125] begin check local disk info of client
I0320 00:14:21.648105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:14:21.648110 543705 disk_info.go:196] parse disk info done, disk is : [0xc000483f00 0xc000483f40]
E0320 00:14:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:23.409783 543705 memory.go:184] no items to output this cycle
I0320 00:14:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 00:14:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:33.409768 543705 memory.go:184] no items to output this cycle
I0320 00:14:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:14:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:43.409794 543705 memory.go:191] Add success.
I0320 00:14:43.409812 543705 cpu.go:282] Add success.
I0320 00:14:43.420044 543705 net.go:648] Add success.
I0320 00:14:43.422867 543705 net.go:770] primary dev: ETH0
I0320 00:14:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:14:43.422897 543705 net.go:698] Add success.
I0320 00:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:14:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:14:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:14:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:14:53.409791 543705 memory.go:184] no items to output this cycle
I0320 00:14:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 00:15:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:03.409770 543705 memory.go:184] no items to output this cycle
I0320 00:15:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:15:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:13.409795 543705 memory.go:191] Add success.
I0320 00:15:13.409798 543705 cpu.go:282] Add success.
W0320 00:15:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:15:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:15:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:15:13.419709 543705 net.go:648] Add success.
I0320 00:15:13.422255 543705 net.go:770] primary dev: ETH0
I0320 00:15:13.422267 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:15:13.422278 543705 net.go:698] Add success.
I0320 00:15:14.301342 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0a4d3c0e-190e-4104-b10c-fc913246b372","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:15:14.301375 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:15:14.453974 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:15:14.454168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:15:14.454245 543705 disk_worker.go:708] disk space is not compliant
W0320 00:15:14.454248 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:15:14.455815 543705 disk_worker.go:494] system disk:vda1
I0320 00:15:14.455844 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:15:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:15:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:15:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:15:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:15:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:15:21.648882 543705 disk_info.go:125] begin check local disk info of client
I0320 00:15:21.651359 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:15:21.651365 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c140 0xc00034c180]
E0320 00:15:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:23.409764 543705 memory.go:184] no items to output this cycle
I0320 00:15:23.409793 543705 cpu.go:275] no items to output this cycle
E0320 00:15:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:33.409776 543705 memory.go:184] no items to output this cycle
I0320 00:15:33.409781 543705 cpu.go:275] no items to output this cycle
I0320 00:15:37.980582 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:15:37.980588 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:15:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:43.410700 543705 memory.go:191] Add success.
I0320 00:15:43.409810 543705 cpu.go:282] Add success.
I0320 00:15:43.420436 543705 net.go:648] Add success.
I0320 00:15:43.422950 543705 net.go:770] primary dev: ETH0
I0320 00:15:43.422964 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:15:43.422977 543705 net.go:698] Add success.
I0320 00:15:46.458003 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:15:46.458082 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:15:46.458116 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:15:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:15:53.409795 543705 memory.go:184] no items to output this cycle
I0320 00:15:53.409858 543705 cpu.go:275] no items to output this cycle
E0320 00:16:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:03.409790 543705 memory.go:184] no items to output this cycle
I0320 00:16:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 00:16:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:13.409855 543705 memory.go:191] Add success.
W0320 00:16:13.409887 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:16:13.409901 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:16:13.409903 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:16:13.409958 543705 cpu.go:282] Add success.
I0320 00:16:13.419717 543705 net.go:648] Add success.
I0320 00:16:13.422414 543705 net.go:770] primary dev: ETH0
I0320 00:16:13.422427 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:16:13.422440 543705 net.go:698] Add success.
I0320 00:16:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:16:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:16:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 00:16:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:16:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 00:16:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:16:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:16:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:16:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:16:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:16:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:16:21.651448 543705 disk_info.go:125] begin check local disk info of client
I0320 00:16:21.653922 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:16:21.653929 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad800 0xc0003ad840]
E0320 00:16:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:23.409767 543705 cpu.go:275] no items to output this cycle
I0320 00:16:23.409780 543705 memory.go:184] no items to output this cycle
E0320 00:16:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:33.409796 543705 memory.go:184] no items to output this cycle
I0320 00:16:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 00:16:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:43.409802 543705 memory.go:191] Add success.
I0320 00:16:43.409804 543705 cpu.go:282] Add success.
I0320 00:16:43.420035 543705 net.go:648] Add success.
I0320 00:16:43.422735 543705 net.go:770] primary dev: ETH0
I0320 00:16:43.422754 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:16:43.422769 543705 net.go:698] Add success.
I0320 00:16:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:16:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:16:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:16:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:16:53.409776 543705 memory.go:184] no items to output this cycle
I0320 00:16:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 00:17:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:03.409768 543705 memory.go:184] no items to output this cycle
I0320 00:17:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:17:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:13.409832 543705 memory.go:191] Add success.
I0320 00:17:13.409837 543705 cpu.go:282] Add success.
W0320 00:17:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:17:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:17:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:17:13.420163 543705 net.go:648] Add success.
I0320 00:17:13.422890 543705 net.go:770] primary dev: ETH0
I0320 00:17:13.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:17:13.422915 543705 net.go:698] Add success.
I0320 00:17:13.453454 543705 event_worker.go:152] Polling the log file for events...
W0320 00:17:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:17:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 00:17:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:17:14.456912 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:17:14.456921 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:17:14.456927 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:17:14.457002 543705 disk_worker.go:494] system disk:vda1
I0320 00:17:14.457044 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:17:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:17:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:17:16.457959 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:17:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:17:16.458012 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:17:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:17:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:17:21.654735 543705 disk_info.go:125] begin check local disk info of client
I0320 00:17:21.657097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:17:21.657104 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c3c0 0xc00034c400]
E0320 00:17:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:23.409765 543705 memory.go:184] no items to output this cycle
I0320 00:17:23.409783 543705 cpu.go:275] no items to output this cycle
E0320 00:17:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:33.409795 543705 memory.go:184] no items to output this cycle
I0320 00:17:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:17:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:43.409801 543705 memory.go:191] Add success.
I0320 00:17:43.409802 543705 cpu.go:282] Add success.
I0320 00:17:43.419855 543705 net.go:648] Add success.
I0320 00:17:43.422615 543705 net.go:770] primary dev: ETH0
I0320 00:17:43.422628 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:17:43.422641 543705 net.go:698] Add success.
I0320 00:17:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:17:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:17:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:17:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:17:53.409780 543705 memory.go:184] no items to output this cycle
I0320 00:17:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:18:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:03.409775 543705 memory.go:184] no items to output this cycle
I0320 00:18:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:18:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:13.409816 543705 memory.go:191] Add success.
I0320 00:18:13.409820 543705 cpu.go:282] Add success.
W0320 00:18:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:18:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:18:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:18:13.419709 543705 net.go:648] Add success.
I0320 00:18:13.422613 543705 net.go:770] primary dev: ETH0
I0320 00:18:13.422629 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:18:13.422643 543705 net.go:698] Add success.
I0320 00:18:13.908202 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8ae1b429-cc3e-42bb-9f8c-60912de3e8f3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:18:13.908241 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:18:14.454680 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:18:14.454804 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:18:14.454864 543705 disk_worker.go:708] disk space is not compliant
W0320 00:18:14.454867 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:18:14.456217 543705 disk_worker.go:494] system disk:vda1
I0320 00:18:14.456261 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:18:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:18:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:18:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:18:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:18:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:18:21.657672 543705 disk_info.go:125] begin check local disk info of client
I0320 00:18:21.660112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:18:21.660118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4280 0xc0000c42c0]
E0320 00:18:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:23.409767 543705 memory.go:184] no items to output this cycle
I0320 00:18:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 00:18:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:33.409781 543705 memory.go:184] no items to output this cycle
I0320 00:18:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 00:18:37.981743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:18:37.981750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:18:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:43.410712 543705 memory.go:191] Add success.
I0320 00:18:43.409815 543705 cpu.go:282] Add success.
I0320 00:18:43.420443 543705 net.go:648] Add success.
I0320 00:18:43.423808 543705 net.go:770] primary dev: ETH0
I0320 00:18:43.423820 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:18:43.423832 543705 net.go:698] Add success.
I0320 00:18:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:18:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:18:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:18:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:18:53.409808 543705 memory.go:184] no items to output this cycle
I0320 00:18:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 00:19:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:03.409788 543705 memory.go:184] no items to output this cycle
I0320 00:19:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:19:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:13.409802 543705 memory.go:191] Add success.
I0320 00:19:13.409823 543705 cpu.go:282] Add success.
W0320 00:19:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:19:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:19:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:19:13.419747 543705 net.go:648] Add success.
I0320 00:19:13.422718 543705 net.go:770] primary dev: ETH0
I0320 00:19:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:19:13.422742 543705 net.go:698] Add success.
I0320 00:19:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:19:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:19:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 00:19:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:19:14.456743 543705 disk_worker.go:494] system disk:vda1
I0320 00:19:14.456770 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:19:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:19:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:19:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:19:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:19:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:19:21.660935 543705 disk_info.go:125] begin check local disk info of client
I0320 00:19:21.663300 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:19:21.663306 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003cf7c0 0xc0003cf800]
E0320 00:19:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:23.409786 543705 memory.go:184] no items to output this cycle
I0320 00:19:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 00:19:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:33.409799 543705 memory.go:184] no items to output this cycle
I0320 00:19:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 00:19:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:43.409797 543705 memory.go:191] Add success.
I0320 00:19:43.409800 543705 cpu.go:282] Add success.
I0320 00:19:43.420067 543705 net.go:648] Add success.
I0320 00:19:43.422910 543705 net.go:770] primary dev: ETH0
I0320 00:19:43.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:19:43.422935 543705 net.go:698] Add success.
I0320 00:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:19:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:19:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:19:53.410352 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:19:53.410367 543705 memory.go:184] no items to output this cycle
I0320 00:19:53.410394 543705 cpu.go:275] no items to output this cycle
E0320 00:20:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:03.409770 543705 memory.go:184] no items to output this cycle
I0320 00:20:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 00:20:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:13.409813 543705 memory.go:191] Add success.
I0320 00:20:13.409826 543705 cpu.go:282] Add success.
W0320 00:20:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:20:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:20:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:20:13.420414 543705 net.go:648] Add success.
I0320 00:20:13.423218 543705 net.go:770] primary dev: ETH0
I0320 00:20:13.423231 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:20:13.423243 543705 net.go:698] Add success.
I0320 00:20:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:20:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:20:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 00:20:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:20:14.456565 543705 disk_worker.go:494] system disk:vda1
I0320 00:20:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:20:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:20:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:20:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:20:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:20:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:20:21.663389 543705 disk_info.go:125] begin check local disk info of client
I0320 00:20:21.665868 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:20:21.665874 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e8340 0xc0004e8380]
E0320 00:20:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:23.409758 543705 memory.go:184] no items to output this cycle
I0320 00:20:23.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:20:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:33.409773 543705 memory.go:184] no items to output this cycle
I0320 00:20:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 00:20:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:43.409791 543705 memory.go:191] Add success.
I0320 00:20:43.409814 543705 cpu.go:282] Add success.
I0320 00:20:43.419910 543705 net.go:648] Add success.
I0320 00:20:43.422522 543705 net.go:770] primary dev: ETH0
I0320 00:20:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:20:43.422551 543705 net.go:698] Add success.
I0320 00:20:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:20:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:20:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:20:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:20:53.409778 543705 memory.go:184] no items to output this cycle
I0320 00:20:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 00:21:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:03.409767 543705 memory.go:184] no items to output this cycle
I0320 00:21:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:21:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:13.409826 543705 memory.go:191] Add success.
I0320 00:21:13.409833 543705 cpu.go:282] Add success.
W0320 00:21:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:21:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:21:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:21:13.420170 543705 net.go:648] Add success.
I0320 00:21:13.422696 543705 net.go:770] primary dev: ETH0
I0320 00:21:13.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:21:13.422726 543705 net.go:698] Add success.
I0320 00:21:13.469622 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"70f58bbf-eb92-45ec-89ea-c54a1ae95176","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:21:13.469667 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:21:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:21:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:21:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 00:21:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:21:14.456681 543705 disk_worker.go:494] system disk:vda1
I0320 00:21:14.456709 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:21:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:21:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:21:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:21:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:21:21.665958 543705 disk_info.go:125] begin check local disk info of client
I0320 00:21:21.668463 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:21:21.668470 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053e1c0 0xc00053e200]
E0320 00:21:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:23.409769 543705 cpu.go:275] no items to output this cycle
I0320 00:21:23.409776 543705 memory.go:184] no items to output this cycle
E0320 00:21:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:33.409793 543705 memory.go:184] no items to output this cycle
I0320 00:21:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 00:21:37.984610 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:21:37.984618 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:21:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:43.410665 543705 memory.go:191] Add success.
I0320 00:21:43.409830 543705 cpu.go:282] Add success.
I0320 00:21:43.420398 543705 net.go:648] Add success.
I0320 00:21:43.423180 543705 net.go:770] primary dev: ETH0
I0320 00:21:43.423194 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:21:43.423209 543705 net.go:698] Add success.
I0320 00:21:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:21:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:21:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:21:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:21:53.409777 543705 cpu.go:275] no items to output this cycle
I0320 00:21:53.409779 543705 memory.go:184] no items to output this cycle
E0320 00:22:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:03.409797 543705 memory.go:184] no items to output this cycle
I0320 00:22:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:22:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:13.409815 543705 memory.go:191] Add success.
I0320 00:22:13.409823 543705 cpu.go:282] Add success.
W0320 00:22:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:22:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:22:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:22:13.420051 543705 net.go:648] Add success.
I0320 00:22:13.422661 543705 net.go:770] primary dev: ETH0
I0320 00:22:13.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:22:13.422687 543705 net.go:698] Add success.
W0320 00:22:14.455601 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:22:14.455615 543705 disk_worker.go:708] disk space is not compliant
W0320 00:22:14.455620 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:22:14.456240 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:22:14.456250 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:22:14.456257 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:22:14.457402 543705 disk_worker.go:494] system disk:vda1
I0320 00:22:14.457432 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:22:15.456812 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:22:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 00:22:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:22:16.457936 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:22:16.457989 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:22:16.458010 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:22:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:22:21.668992 543705 disk_info.go:125] begin check local disk info of client
I0320 00:22:21.671741 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:22:21.671747 543705 disk_info.go:196] parse disk info done, disk is : [0xc000530080 0xc0005300c0]
E0320 00:22:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:23.409791 543705 memory.go:184] no items to output this cycle
I0320 00:22:23.409810 543705 cpu.go:275] no items to output this cycle
E0320 00:22:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 00:22:33.409794 543705 memory.go:184] no items to output this cycle
E0320 00:22:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:43.409831 543705 memory.go:191] Add success.
I0320 00:22:43.409837 543705 cpu.go:282] Add success.
I0320 00:22:43.419992 543705 net.go:648] Add success.
I0320 00:22:43.422879 543705 net.go:770] primary dev: ETH0
I0320 00:22:43.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:22:43.422905 543705 net.go:698] Add success.
I0320 00:22:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:22:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:22:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:22:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:22:53.409768 543705 memory.go:184] no items to output this cycle
I0320 00:22:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 00:23:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:03.409777 543705 memory.go:184] no items to output this cycle
I0320 00:23:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:23:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:13.409798 543705 memory.go:191] Add success.
I0320 00:23:13.409798 543705 cpu.go:282] Add success.
W0320 00:23:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:23:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:23:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:23:13.420267 543705 net.go:648] Add success.
I0320 00:23:13.423023 543705 net.go:770] primary dev: ETH0
I0320 00:23:13.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:23:13.423049 543705 net.go:698] Add success.
I0320 00:23:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:23:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:23:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 00:23:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:23:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 00:23:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:23:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:23:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:23:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:23:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:23:16.472423 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:23:21.673001 543705 disk_info.go:125] begin check local disk info of client
I0320 00:23:21.675496 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:23:21.675502 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344480 0xc0003444c0]
E0320 00:23:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:23.409772 543705 memory.go:184] no items to output this cycle
I0320 00:23:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:23:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:33.409778 543705 memory.go:184] no items to output this cycle
I0320 00:23:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:23:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:43.409791 543705 memory.go:191] Add success.
I0320 00:23:43.409811 543705 cpu.go:282] Add success.
I0320 00:23:43.419888 543705 net.go:648] Add success.
I0320 00:23:43.422771 543705 net.go:770] primary dev: ETH0
I0320 00:23:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:23:43.422798 543705 net.go:698] Add success.
I0320 00:23:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:23:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:23:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:23:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:23:53.409795 543705 memory.go:184] no items to output this cycle
I0320 00:23:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 00:24:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:03.409798 543705 memory.go:184] no items to output this cycle
I0320 00:24:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 00:24:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:13.409779 543705 memory.go:191] Add success.
I0320 00:24:13.409805 543705 cpu.go:282] Add success.
W0320 00:24:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:24:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:24:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:24:13.420090 543705 net.go:648] Add success.
I0320 00:24:13.423466 543705 net.go:770] primary dev: ETH0
I0320 00:24:13.423479 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:24:13.423490 543705 net.go:698] Add success.
I0320 00:24:13.530384 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"180b2ee0-f841-4b1f-a103-11a675fcfe29","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:24:13.530417 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:24:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:24:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:24:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 00:24:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:24:14.456621 543705 disk_worker.go:494] system disk:vda1
I0320 00:24:14.456650 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:24:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:24:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:24:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:24:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:24:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:24:21.675578 543705 disk_info.go:125] begin check local disk info of client
I0320 00:24:21.677991 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:24:21.677998 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5b40 0xc0000c5b80]
E0320 00:24:23.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:23.409912 543705 memory.go:184] no items to output this cycle
I0320 00:24:23.410012 543705 cpu.go:275] no items to output this cycle
E0320 00:24:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:33.409775 543705 memory.go:184] no items to output this cycle
I0320 00:24:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 00:24:37.985744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:24:37.985750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:24:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:43.410702 543705 memory.go:191] Add success.
I0320 00:24:43.409821 543705 cpu.go:282] Add success.
I0320 00:24:43.420428 543705 net.go:648] Add success.
I0320 00:24:43.423250 543705 net.go:770] primary dev: ETH0
I0320 00:24:43.423266 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:24:43.423281 543705 net.go:698] Add success.
I0320 00:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:24:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:24:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:24:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:24:53.409766 543705 memory.go:184] no items to output this cycle
I0320 00:24:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:25:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:03.409769 543705 memory.go:184] no items to output this cycle
I0320 00:25:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 00:25:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:13.409818 543705 memory.go:191] Add success.
I0320 00:25:13.409821 543705 cpu.go:282] Add success.
W0320 00:25:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:25:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:25:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:25:13.420708 543705 net.go:648] Add success.
I0320 00:25:13.423416 543705 net.go:770] primary dev: ETH0
I0320 00:25:13.423430 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:25:13.423442 543705 net.go:698] Add success.
I0320 00:25:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:25:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:25:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 00:25:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:25:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 00:25:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:25:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:25:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:25:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:25:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:25:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:25:21.679033 543705 disk_info.go:125] begin check local disk info of client
I0320 00:25:21.681449 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:25:21.681456 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab080 0xc0001ab0c0]
I0320 00:25:23.409870 543705 cpu.go:275] no items to output this cycle
E0320 00:25:23.409941 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:23.409953 543705 memory.go:184] no items to output this cycle
E0320 00:25:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:33.409802 543705 memory.go:184] no items to output this cycle
I0320 00:25:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 00:25:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:43.409794 543705 memory.go:191] Add success.
I0320 00:25:43.409812 543705 cpu.go:282] Add success.
I0320 00:25:43.420069 543705 net.go:648] Add success.
I0320 00:25:43.422693 543705 net.go:770] primary dev: ETH0
I0320 00:25:43.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:25:43.422720 543705 net.go:698] Add success.
I0320 00:25:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:25:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:25:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:25:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:25:53.409766 543705 memory.go:184] no items to output this cycle
I0320 00:25:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:26:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:03.409769 543705 memory.go:184] no items to output this cycle
I0320 00:26:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:26:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:13.409791 543705 memory.go:191] Add success.
I0320 00:26:13.409794 543705 cpu.go:282] Add success.
W0320 00:26:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:26:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:26:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:26:13.420309 543705 net.go:648] Add success.
I0320 00:26:13.422983 543705 net.go:770] primary dev: ETH0
I0320 00:26:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:26:13.423009 543705 net.go:698] Add success.
I0320 00:26:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:26:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:26:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 00:26:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:26:14.456526 543705 disk_worker.go:494] system disk:vda1
I0320 00:26:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:26:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:26:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:26:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:26:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:26:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:26:21.681669 543705 disk_info.go:125] begin check local disk info of client
I0320 00:26:21.684087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:26:21.684094 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002742c0 0xc000274300]
E0320 00:26:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:23.409785 543705 memory.go:184] no items to output this cycle
I0320 00:26:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 00:26:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:33.409803 543705 memory.go:184] no items to output this cycle
I0320 00:26:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:26:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:43.409808 543705 memory.go:191] Add success.
I0320 00:26:43.409810 543705 cpu.go:282] Add success.
I0320 00:26:43.420014 543705 net.go:648] Add success.
I0320 00:26:43.422619 543705 net.go:770] primary dev: ETH0
I0320 00:26:43.422634 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:26:43.422649 543705 net.go:698] Add success.
I0320 00:26:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:26:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:26:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:26:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:26:53.409766 543705 memory.go:184] no items to output this cycle
I0320 00:26:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 00:27:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:03.409778 543705 memory.go:184] no items to output this cycle
I0320 00:27:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 00:27:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:13.409788 543705 memory.go:191] Add success.
I0320 00:27:13.409808 543705 cpu.go:282] Add success.
W0320 00:27:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:27:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:27:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:27:13.420178 543705 net.go:648] Add success.
I0320 00:27:13.429145 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 00:27:13.429224 543705 net.go:770] primary dev: ETH0
I0320 00:27:13.429236 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:27:13.429250 543705 net.go:698] Add success.
I0320 00:27:13.452777 543705 event_worker.go:152] Polling the log file for events...
I0320 00:27:13.624418 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f4263992-4966-4763-a7ca-118d6b1e7bbc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:27:13.624452 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 00:27:14.454158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:27:14.454219 543705 disk_worker.go:708] disk space is not compliant
W0320 00:27:14.454222 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:27:14.456040 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:27:14.456047 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:27:14.456051 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:27:14.456060 543705 disk_worker.go:494] system disk:vda1
I0320 00:27:14.456115 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:27:15.456825 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:27:15.456833 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:27:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:27:16.457968 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:27:16.458013 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:27:16.458028 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:27:16.472333 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:27:21.685062 543705 disk_info.go:125] begin check local disk info of client
I0320 00:27:21.687525 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:27:21.687532 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278540 0xc000278580]
E0320 00:27:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 00:27:23.409777 543705 memory.go:184] no items to output this cycle
E0320 00:27:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:33.409801 543705 memory.go:184] no items to output this cycle
I0320 00:27:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 00:27:37.988641 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:27:37.988648 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:27:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:43.410716 543705 memory.go:191] Add success.
I0320 00:27:43.409830 543705 cpu.go:282] Add success.
I0320 00:27:43.420410 543705 net.go:648] Add success.
I0320 00:27:43.423438 543705 net.go:770] primary dev: ETH0
I0320 00:27:43.423453 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:27:43.423468 543705 net.go:698] Add success.
I0320 00:27:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:27:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:27:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:27:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:27:53.409780 543705 memory.go:184] no items to output this cycle
I0320 00:27:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 00:28:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:03.409776 543705 memory.go:184] no items to output this cycle
I0320 00:28:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 00:28:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:13.409797 543705 memory.go:191] Add success.
I0320 00:28:13.409798 543705 cpu.go:282] Add success.
W0320 00:28:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:28:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:28:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:28:13.420219 543705 net.go:648] Add success.
I0320 00:28:13.423148 543705 net.go:770] primary dev: ETH0
I0320 00:28:13.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:28:13.423178 543705 net.go:698] Add success.
I0320 00:28:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:28:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:28:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 00:28:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:28:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 00:28:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:28:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:28:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:28:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:28:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:28:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:28:21.688129 543705 disk_info.go:125] begin check local disk info of client
I0320 00:28:21.690577 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:28:21.690592 543705 disk_info.go:196] parse disk info done, disk is : [0xc000275180 0xc0002751c0]
E0320 00:28:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:23.409798 543705 memory.go:184] no items to output this cycle
I0320 00:28:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:28:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:33.409774 543705 memory.go:184] no items to output this cycle
I0320 00:28:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:28:43.409905 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:43.410015 543705 memory.go:191] Add success.
I0320 00:28:43.410041 543705 cpu.go:282] Add success.
I0320 00:28:43.419732 543705 net.go:648] Add success.
I0320 00:28:43.422475 543705 net.go:770] primary dev: ETH0
I0320 00:28:43.422491 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:28:43.422505 543705 net.go:698] Add success.
I0320 00:28:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:28:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:28:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:28:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:28:53.409772 543705 memory.go:184] no items to output this cycle
I0320 00:28:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:29:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:03.409779 543705 memory.go:184] no items to output this cycle
I0320 00:29:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 00:29:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:13.409823 543705 memory.go:191] Add success.
I0320 00:29:13.409832 543705 cpu.go:282] Add success.
W0320 00:29:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:29:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:29:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:29:13.420382 543705 net.go:648] Add success.
I0320 00:29:13.423067 543705 net.go:770] primary dev: ETH0
I0320 00:29:13.423083 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:29:13.423097 543705 net.go:698] Add success.
I0320 00:29:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:29:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:29:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 00:29:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:29:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 00:29:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:29:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:29:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:29:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:29:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:29:16.472358 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:29:21.691102 543705 disk_info.go:125] begin check local disk info of client
I0320 00:29:21.693531 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:29:21.693537 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039ab80 0xc00039ac00]
E0320 00:29:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:23.409773 543705 memory.go:184] no items to output this cycle
I0320 00:29:23.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:29:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:33.409786 543705 memory.go:184] no items to output this cycle
I0320 00:29:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:29:43.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:43.409844 543705 memory.go:191] Add success.
I0320 00:29:43.409850 543705 cpu.go:282] Add success.
I0320 00:29:43.420069 543705 net.go:648] Add success.
I0320 00:29:43.422748 543705 net.go:770] primary dev: ETH0
I0320 00:29:43.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:29:43.422778 543705 net.go:698] Add success.
I0320 00:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:29:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:29:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:29:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:29:53.409789 543705 memory.go:184] no items to output this cycle
I0320 00:29:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:30:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:03.409778 543705 memory.go:184] no items to output this cycle
I0320 00:30:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 00:30:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:13.409798 543705 memory.go:191] Add success.
I0320 00:30:13.409799 543705 cpu.go:282] Add success.
W0320 00:30:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:30:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:30:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:30:13.420200 543705 net.go:648] Add success.
I0320 00:30:13.422865 543705 net.go:770] primary dev: ETH0
I0320 00:30:13.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:30:13.422890 543705 net.go:698] Add success.
I0320 00:30:13.468635 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0b779572-25ac-4c60-8935-a325bbea8bda","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:30:13.468684 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:30:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:30:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:30:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 00:30:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:30:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 00:30:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:30:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:30:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:30:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:30:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:30:16.472351 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:30:21.693673 543705 disk_info.go:125] begin check local disk info of client
I0320 00:30:21.696061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:30:21.696067 543705 disk_info.go:196] parse disk info done, disk is : [0xc000341280 0xc0003412c0]
E0320 00:30:23.410478 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:23.410495 543705 memory.go:184] no items to output this cycle
I0320 00:30:23.410510 543705 cpu.go:275] no items to output this cycle
E0320 00:30:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:33.409789 543705 memory.go:184] no items to output this cycle
I0320 00:30:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 00:30:37.989733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:30:37.989739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:30:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:43.410663 543705 memory.go:191] Add success.
I0320 00:30:43.409834 543705 cpu.go:282] Add success.
I0320 00:30:43.420389 543705 net.go:648] Add success.
I0320 00:30:43.423237 543705 net.go:770] primary dev: ETH0
I0320 00:30:43.423250 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:30:43.423262 543705 net.go:698] Add success.
I0320 00:30:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:30:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:30:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:30:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:30:53.409806 543705 memory.go:184] no items to output this cycle
I0320 00:30:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 00:31:03.409981 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:03.409997 543705 memory.go:184] no items to output this cycle
I0320 00:31:03.410001 543705 cpu.go:275] no items to output this cycle
E0320 00:31:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:13.409838 543705 memory.go:191] Add success.
I0320 00:31:13.409844 543705 cpu.go:282] Add success.
W0320 00:31:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:31:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:31:13.409893 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:31:13.420206 543705 net.go:648] Add success.
I0320 00:31:13.423087 543705 net.go:770] primary dev: ETH0
I0320 00:31:13.423102 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:31:13.423117 543705 net.go:698] Add success.
I0320 00:31:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:31:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:31:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 00:31:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:31:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 00:31:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:31:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:31:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:31:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:31:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:31:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:31:21.697175 543705 disk_info.go:125] begin check local disk info of client
I0320 00:31:21.699664 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:31:21.699670 543705 disk_info.go:196] parse disk info done, disk is : [0xc000290340 0xc000290380]
E0320 00:31:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:23.409767 543705 memory.go:184] no items to output this cycle
I0320 00:31:23.409773 543705 cpu.go:275] no items to output this cycle
E0320 00:31:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:33.409809 543705 memory.go:184] no items to output this cycle
I0320 00:31:33.409834 543705 cpu.go:275] no items to output this cycle
E0320 00:31:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:43.409832 543705 memory.go:191] Add success.
I0320 00:31:43.409848 543705 cpu.go:282] Add success.
I0320 00:31:43.420077 543705 net.go:648] Add success.
I0320 00:31:43.423075 543705 net.go:770] primary dev: ETH0
I0320 00:31:43.423091 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:31:43.423105 543705 net.go:698] Add success.
I0320 00:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:31:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:31:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:31:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:31:53.409777 543705 memory.go:184] no items to output this cycle
I0320 00:31:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:32:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:03.409804 543705 memory.go:184] no items to output this cycle
I0320 00:32:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 00:32:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:13.409784 543705 memory.go:191] Add success.
I0320 00:32:13.409806 543705 cpu.go:282] Add success.
W0320 00:32:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:32:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:32:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:32:13.420116 543705 net.go:648] Add success.
I0320 00:32:13.422693 543705 net.go:770] primary dev: ETH0
I0320 00:32:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:32:13.422721 543705 net.go:698] Add success.
W0320 00:32:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:32:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 00:32:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:32:14.456983 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:32:14.456993 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:32:14.457000 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:32:14.457025 543705 disk_worker.go:494] system disk:vda1
I0320 00:32:14.457065 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:32:15.456771 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:32:15.456780 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:32:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:32:16.457919 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:32:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:32:16.457994 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:32:16.472303 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:32:21.699751 543705 disk_info.go:125] begin check local disk info of client
I0320 00:32:21.702206 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:32:21.702212 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475e00 0xc000475e40]
E0320 00:32:23.409947 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:23.409960 543705 cpu.go:275] no items to output this cycle
I0320 00:32:23.409972 543705 memory.go:184] no items to output this cycle
E0320 00:32:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:33.409764 543705 memory.go:184] no items to output this cycle
I0320 00:32:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:32:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:43.409809 543705 memory.go:191] Add success.
I0320 00:32:43.409818 543705 cpu.go:282] Add success.
I0320 00:32:43.420054 543705 net.go:648] Add success.
I0320 00:32:43.423172 543705 net.go:770] primary dev: ETH0
I0320 00:32:43.423189 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:32:43.423208 543705 net.go:698] Add success.
I0320 00:32:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:32:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:32:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:32:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:32:53.409781 543705 cpu.go:275] no items to output this cycle
I0320 00:32:53.409788 543705 memory.go:184] no items to output this cycle
E0320 00:33:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:03.409784 543705 cpu.go:275] no items to output this cycle
I0320 00:33:03.409786 543705 memory.go:184] no items to output this cycle
E0320 00:33:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:13.409823 543705 memory.go:191] Add success.
I0320 00:33:13.409823 543705 cpu.go:282] Add success.
W0320 00:33:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:33:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:33:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:33:13.420192 543705 net.go:648] Add success.
I0320 00:33:13.423084 543705 net.go:770] primary dev: ETH0
I0320 00:33:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:33:13.423120 543705 net.go:698] Add success.
I0320 00:33:13.995331 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"528bb2df-424f-45d6-8169-9d217a43138e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:33:13.995366 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:33:14.454677 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:33:14.454809 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:33:14.454871 543705 disk_worker.go:708] disk space is not compliant
W0320 00:33:14.454874 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:33:14.456236 543705 disk_worker.go:494] system disk:vda1
I0320 00:33:14.456278 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:33:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:33:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:33:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:33:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:33:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:33:21.703160 543705 disk_info.go:125] begin check local disk info of client
I0320 00:33:21.705575 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:33:21.705581 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c100 0xc00034c140]
E0320 00:33:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:23.409788 543705 memory.go:184] no items to output this cycle
I0320 00:33:23.409807 543705 cpu.go:275] no items to output this cycle
E0320 00:33:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 00:33:33.409782 543705 memory.go:184] no items to output this cycle
I0320 00:33:37.992651 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:33:37.992658 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:33:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:43.410643 543705 memory.go:191] Add success.
I0320 00:33:43.409804 543705 cpu.go:282] Add success.
I0320 00:33:43.420409 543705 net.go:648] Add success.
I0320 00:33:43.423038 543705 net.go:770] primary dev: ETH0
I0320 00:33:43.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:33:43.423064 543705 net.go:698] Add success.
I0320 00:33:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:33:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:33:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:33:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:33:53.409769 543705 memory.go:184] no items to output this cycle
I0320 00:33:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 00:34:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:03.409773 543705 memory.go:184] no items to output this cycle
I0320 00:34:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:34:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:13.409814 543705 memory.go:191] Add success.
I0320 00:34:13.409821 543705 cpu.go:282] Add success.
W0320 00:34:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:34:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:34:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:34:13.420140 543705 net.go:648] Add success.
I0320 00:34:13.423205 543705 net.go:770] primary dev: ETH0
I0320 00:34:13.423220 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:34:13.423234 543705 net.go:698] Add success.
I0320 00:34:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:34:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:34:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 00:34:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:34:14.459205 543705 disk_worker.go:494] system disk:vda1
I0320 00:34:14.459237 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:34:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:34:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:34:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:34:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:34:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:34:21.705681 543705 disk_info.go:125] begin check local disk info of client
I0320 00:34:21.708213 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:34:21.708220 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005384c0 0xc000538500]
E0320 00:34:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:23.409776 543705 memory.go:184] no items to output this cycle
I0320 00:34:23.409782 543705 cpu.go:275] no items to output this cycle
E0320 00:34:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:33.409765 543705 memory.go:184] no items to output this cycle
I0320 00:34:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:34:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:43.409832 543705 memory.go:191] Add success.
I0320 00:34:43.409856 543705 cpu.go:282] Add success.
I0320 00:34:43.420145 543705 net.go:648] Add success.
I0320 00:34:43.423212 543705 net.go:770] primary dev: ETH0
I0320 00:34:43.423228 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:34:43.423245 543705 net.go:698] Add success.
I0320 00:34:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:34:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:34:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:34:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:34:53.409775 543705 memory.go:184] no items to output this cycle
I0320 00:34:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:35:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:03.409803 543705 memory.go:184] no items to output this cycle
I0320 00:35:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:35:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:13.409788 543705 memory.go:191] Add success.
I0320 00:35:13.409814 543705 cpu.go:282] Add success.
W0320 00:35:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:35:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:35:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:35:13.420055 543705 net.go:648] Add success.
I0320 00:35:13.423189 543705 net.go:770] primary dev: ETH0
I0320 00:35:13.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:35:13.423215 543705 net.go:698] Add success.
I0320 00:35:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:35:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:35:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 00:35:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:35:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 00:35:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:35:15.456005 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:35:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:35:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:35:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:35:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:35:21.709248 543705 disk_info.go:125] begin check local disk info of client
I0320 00:35:21.711712 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:35:21.711718 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003302c0 0xc000330300]
E0320 00:35:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:23.409792 543705 memory.go:184] no items to output this cycle
I0320 00:35:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:35:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:33.409800 543705 memory.go:184] no items to output this cycle
I0320 00:35:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 00:35:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:43.409789 543705 memory.go:191] Add success.
I0320 00:35:43.409818 543705 cpu.go:282] Add success.
I0320 00:35:43.419971 543705 net.go:648] Add success.
I0320 00:35:43.422765 543705 net.go:770] primary dev: ETH0
I0320 00:35:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:35:43.422795 543705 net.go:698] Add success.
I0320 00:35:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:35:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:35:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:35:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:35:53.409767 543705 memory.go:184] no items to output this cycle
I0320 00:35:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:36:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:03.409799 543705 memory.go:184] no items to output this cycle
I0320 00:36:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:36:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:13.409787 543705 memory.go:191] Add success.
I0320 00:36:13.409790 543705 cpu.go:282] Add success.
W0320 00:36:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:36:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:36:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:36:13.420062 543705 net.go:648] Add success.
I0320 00:36:13.422904 543705 net.go:770] primary dev: ETH0
I0320 00:36:13.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:36:13.422930 543705 net.go:698] Add success.
I0320 00:36:13.463673 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7de70590-d2d4-4167-a8e3-09df588ddd0e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:36:13.463705 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:36:14.455226 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:36:14.455236 543705 disk_worker.go:708] disk space is not compliant
W0320 00:36:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:36:14.457160 543705 disk_worker.go:494] system disk:vda1
I0320 00:36:14.457191 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:36:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:36:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:36:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:36:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:36:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:36:21.713199 543705 disk_info.go:125] begin check local disk info of client
I0320 00:36:21.715695 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:36:21.715702 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d5940 0xc0004d5980]
E0320 00:36:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:23.409796 543705 memory.go:184] no items to output this cycle
I0320 00:36:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 00:36:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:33.409798 543705 memory.go:184] no items to output this cycle
I0320 00:36:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 00:36:37.993733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:36:37.993739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:36:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:43.409798 543705 memory.go:191] Add success.
I0320 00:36:43.409855 543705 cpu.go:282] Add success.
I0320 00:36:43.420254 543705 net.go:648] Add success.
I0320 00:36:43.421165 543705 net.go:770] primary dev: ETH0
I0320 00:36:43.421180 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:36:43.421196 543705 net.go:698] Add success.
I0320 00:36:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:36:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:36:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:36:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:36:53.409771 543705 memory.go:184] no items to output this cycle
I0320 00:36:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 00:37:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:03.409800 543705 memory.go:184] no items to output this cycle
I0320 00:37:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:37:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:13.409820 543705 memory.go:191] Add success.
I0320 00:37:13.409824 543705 cpu.go:282] Add success.
W0320 00:37:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:37:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:37:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:37:13.420185 543705 net.go:648] Add success.
I0320 00:37:13.423127 543705 net.go:770] primary dev: ETH0
I0320 00:37:13.423141 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:37:13.423153 543705 net.go:698] Add success.
I0320 00:37:13.453662 543705 event_worker.go:152] Polling the log file for events...
W0320 00:37:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:37:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 00:37:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:37:14.455972 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:37:14.455981 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:37:14.455988 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:37:14.456452 543705 disk_worker.go:494] system disk:vda1
I0320 00:37:14.456481 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:37:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:37:15.456833 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:37:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:37:16.457909 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:37:16.457962 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:37:16.457981 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:37:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:37:21.716281 543705 disk_info.go:125] begin check local disk info of client
I0320 00:37:21.718819 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:37:21.718826 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462240 0xc000462280]
E0320 00:37:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:23.409795 543705 memory.go:184] no items to output this cycle
I0320 00:37:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 00:37:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:33.409798 543705 memory.go:184] no items to output this cycle
I0320 00:37:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 00:37:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:43.409795 543705 memory.go:191] Add success.
I0320 00:37:43.409871 543705 cpu.go:282] Add success.
I0320 00:37:43.420124 543705 net.go:648] Add success.
I0320 00:37:43.423123 543705 net.go:770] primary dev: ETH0
I0320 00:37:43.423136 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:37:43.423160 543705 net.go:698] Add success.
I0320 00:37:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:37:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:37:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:37:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:37:53.409771 543705 memory.go:184] no items to output this cycle
I0320 00:37:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 00:38:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:03.409796 543705 memory.go:184] no items to output this cycle
I0320 00:38:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 00:38:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:13.409786 543705 memory.go:191] Add success.
I0320 00:38:13.409788 543705 cpu.go:282] Add success.
W0320 00:38:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:38:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:38:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:38:13.420202 543705 net.go:648] Add success.
I0320 00:38:13.422945 543705 net.go:770] primary dev: ETH0
I0320 00:38:13.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:38:13.422971 543705 net.go:698] Add success.
I0320 00:38:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:38:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:38:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 00:38:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:38:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 00:38:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:38:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:38:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:38:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:38:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:38:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:38:21.718915 543705 disk_info.go:125] begin check local disk info of client
I0320 00:38:21.721379 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:38:21.721385 543705 disk_info.go:196] parse disk info done, disk is : [0xc000508700 0xc000508740]
E0320 00:38:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:23.409806 543705 memory.go:184] no items to output this cycle
I0320 00:38:23.409818 543705 cpu.go:275] no items to output this cycle
E0320 00:38:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:33.409774 543705 memory.go:184] no items to output this cycle
I0320 00:38:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 00:38:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:43.409821 543705 memory.go:191] Add success.
I0320 00:38:43.409824 543705 cpu.go:282] Add success.
I0320 00:38:43.420718 543705 net.go:648] Add success.
I0320 00:38:43.421742 543705 net.go:770] primary dev: ETH0
I0320 00:38:43.421754 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:38:43.421767 543705 net.go:698] Add success.
I0320 00:38:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:38:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:38:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:38:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:38:53.409767 543705 memory.go:184] no items to output this cycle
I0320 00:38:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 00:39:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:03.409776 543705 memory.go:184] no items to output this cycle
I0320 00:39:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:39:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:13.409821 543705 memory.go:191] Add success.
I0320 00:39:13.409822 543705 cpu.go:282] Add success.
W0320 00:39:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:39:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:39:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:39:13.420315 543705 net.go:648] Add success.
I0320 00:39:13.422998 543705 net.go:770] primary dev: ETH0
I0320 00:39:13.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:39:13.423025 543705 net.go:698] Add success.
I0320 00:39:13.464217 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"04cbde80-612e-4984-8877-61ddba7656fe","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:39:13.464250 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:39:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:39:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:39:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 00:39:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:39:14.456693 543705 disk_worker.go:494] system disk:vda1
I0320 00:39:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:39:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:39:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:39:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:39:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:39:16.472430 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:39:21.721684 543705 disk_info.go:125] begin check local disk info of client
I0320 00:39:21.724186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:39:21.724193 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028c780 0xc00028c7c0]
E0320 00:39:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:23.409789 543705 memory.go:184] no items to output this cycle
I0320 00:39:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 00:39:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:33.409781 543705 memory.go:184] no items to output this cycle
I0320 00:39:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 00:39:37.996676 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:39:37.996683 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:39:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:43.409788 543705 memory.go:191] Add success.
I0320 00:39:43.409850 543705 cpu.go:282] Add success.
I0320 00:39:43.420085 543705 net.go:648] Add success.
I0320 00:39:43.421044 543705 net.go:770] primary dev: ETH0
I0320 00:39:43.421061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:39:43.421074 543705 net.go:698] Add success.
I0320 00:39:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:39:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:39:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:39:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:39:53.409765 543705 memory.go:184] no items to output this cycle
I0320 00:39:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 00:40:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:03.409797 543705 memory.go:184] no items to output this cycle
I0320 00:40:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:40:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:13.409814 543705 memory.go:191] Add success.
I0320 00:40:13.409818 543705 cpu.go:282] Add success.
W0320 00:40:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:40:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:40:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:40:13.420057 543705 net.go:648] Add success.
I0320 00:40:13.422765 543705 net.go:770] primary dev: ETH0
I0320 00:40:13.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:40:13.422789 543705 net.go:698] Add success.
I0320 00:40:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:40:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:40:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 00:40:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:40:14.456633 543705 disk_worker.go:494] system disk:vda1
I0320 00:40:14.456665 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:40:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:40:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:40:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:40:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:40:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:40:21.724278 543705 disk_info.go:125] begin check local disk info of client
I0320 00:40:21.726879 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:40:21.726887 543705 disk_info.go:196] parse disk info done, disk is : [0xc000358100 0xc000358140]
E0320 00:40:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:23.409759 543705 memory.go:184] no items to output this cycle
I0320 00:40:23.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:40:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:33.409778 543705 memory.go:184] no items to output this cycle
I0320 00:40:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 00:40:43.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:43.409825 543705 cpu.go:282] Add success.
I0320 00:40:43.409839 543705 memory.go:191] Add success.
I0320 00:40:43.420087 543705 net.go:648] Add success.
I0320 00:40:43.423050 543705 net.go:770] primary dev: ETH0
I0320 00:40:43.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:40:43.423076 543705 net.go:698] Add success.
I0320 00:40:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:40:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:40:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:40:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:40:53.409777 543705 memory.go:184] no items to output this cycle
I0320 00:40:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:41:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:03.409782 543705 memory.go:184] no items to output this cycle
I0320 00:41:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:41:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:13.409826 543705 memory.go:191] Add success.
I0320 00:41:13.409828 543705 cpu.go:282] Add success.
W0320 00:41:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:41:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:41:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:41:13.420364 543705 net.go:648] Add success.
I0320 00:41:13.423333 543705 net.go:770] primary dev: ETH0
I0320 00:41:13.423349 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:41:13.423363 543705 net.go:698] Add success.
I0320 00:41:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:41:14.455215 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:41:14.455229 543705 disk_worker.go:708] disk space is not compliant
W0320 00:41:14.455232 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:41:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 00:41:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:41:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:41:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:41:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:41:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:41:16.472459 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:41:21.728279 543705 disk_info.go:125] begin check local disk info of client
I0320 00:41:21.730812 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:41:21.730818 543705 disk_info.go:196] parse disk info done, disk is : [0xc000498400 0xc000498440]
E0320 00:41:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:23.409758 543705 memory.go:184] no items to output this cycle
I0320 00:41:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 00:41:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:33.409769 543705 memory.go:184] no items to output this cycle
I0320 00:41:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:41:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:43.409805 543705 memory.go:191] Add success.
I0320 00:41:43.409851 543705 cpu.go:282] Add success.
I0320 00:41:43.420051 543705 net.go:648] Add success.
I0320 00:41:43.423107 543705 net.go:770] primary dev: ETH0
I0320 00:41:43.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:41:43.423138 543705 net.go:698] Add success.
I0320 00:41:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:41:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:41:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:41:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:41:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 00:41:53.409782 543705 memory.go:184] no items to output this cycle
E0320 00:42:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:03.409770 543705 memory.go:184] no items to output this cycle
I0320 00:42:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:42:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:13.409815 543705 memory.go:191] Add success.
I0320 00:42:13.409817 543705 cpu.go:282] Add success.
W0320 00:42:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:42:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:42:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:42:13.420153 543705 net.go:648] Add success.
I0320 00:42:13.422938 543705 net.go:770] primary dev: ETH0
I0320 00:42:13.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:42:13.422967 543705 net.go:698] Add success.
I0320 00:42:13.469032 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"20c14cf0-f869-4f35-a2e7-b4d441228f3a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:42:13.469065 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 00:42:14.455262 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:42:14.455280 543705 disk_worker.go:708] disk space is not compliant
W0320 00:42:14.455284 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:42:14.456228 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:42:14.456238 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:42:14.456244 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:42:14.457198 543705 disk_worker.go:494] system disk:vda1
I0320 00:42:14.457234 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:42:15.456860 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:42:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:42:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:42:16.457942 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:42:16.457995 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:42:16.458013 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:42:16.472354 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:42:21.731348 543705 disk_info.go:125] begin check local disk info of client
I0320 00:42:21.733835 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:42:21.733842 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394640 0xc000394680]
E0320 00:42:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:23.409762 543705 memory.go:184] no items to output this cycle
I0320 00:42:23.409786 543705 cpu.go:275] no items to output this cycle
E0320 00:42:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:33.409769 543705 memory.go:184] no items to output this cycle
I0320 00:42:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 00:42:37.997729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:42:37.997735 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:42:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:43.410655 543705 memory.go:191] Add success.
I0320 00:42:43.409859 543705 cpu.go:282] Add success.
I0320 00:42:43.420421 543705 net.go:648] Add success.
I0320 00:42:43.423137 543705 net.go:770] primary dev: ETH0
I0320 00:42:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:42:43.423164 543705 net.go:698] Add success.
I0320 00:42:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:42:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:42:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:42:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:42:53.409780 543705 memory.go:184] no items to output this cycle
I0320 00:42:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:43:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:03.409771 543705 memory.go:184] no items to output this cycle
I0320 00:43:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 00:43:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:13.409816 543705 memory.go:191] Add success.
I0320 00:43:13.409822 543705 cpu.go:282] Add success.
W0320 00:43:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:43:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:43:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:43:13.420293 543705 net.go:648] Add success.
I0320 00:43:13.423139 543705 net.go:770] primary dev: ETH0
I0320 00:43:13.423153 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:43:13.423165 543705 net.go:698] Add success.
I0320 00:43:14.454387 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:43:14.454533 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:43:14.454615 543705 disk_worker.go:708] disk space is not compliant
W0320 00:43:14.454618 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:43:14.455967 543705 disk_worker.go:494] system disk:vda1
I0320 00:43:14.456000 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:43:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:43:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:43:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:43:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:43:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:43:21.734728 543705 disk_info.go:125] begin check local disk info of client
I0320 00:43:21.737203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:43:21.737210 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e280 0xc00037e2c0]
E0320 00:43:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:23.409767 543705 memory.go:184] no items to output this cycle
I0320 00:43:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:43:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:33.409765 543705 memory.go:184] no items to output this cycle
I0320 00:43:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:43:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:43.409813 543705 memory.go:191] Add success.
I0320 00:43:43.409817 543705 cpu.go:282] Add success.
I0320 00:43:43.419908 543705 net.go:648] Add success.
I0320 00:43:43.422509 543705 net.go:770] primary dev: ETH0
I0320 00:43:43.422523 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:43:43.422537 543705 net.go:698] Add success.
I0320 00:43:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:43:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:43:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:43:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:43:53.409780 543705 memory.go:184] no items to output this cycle
I0320 00:43:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:44:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:03.409799 543705 memory.go:184] no items to output this cycle
I0320 00:44:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 00:44:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:13.409804 543705 memory.go:191] Add success.
I0320 00:44:13.409812 543705 cpu.go:282] Add success.
W0320 00:44:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:44:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:44:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:44:13.420116 543705 net.go:648] Add success.
I0320 00:44:13.423049 543705 net.go:770] primary dev: ETH0
I0320 00:44:13.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:44:13.423074 543705 net.go:698] Add success.
I0320 00:44:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:44:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:44:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 00:44:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:44:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 00:44:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:44:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:44:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:44:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:44:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:44:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:44:21.737679 543705 disk_info.go:125] begin check local disk info of client
I0320 00:44:21.740073 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:44:21.740079 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048c180 0xc00048c1c0]
E0320 00:44:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:23.409790 543705 memory.go:184] no items to output this cycle
I0320 00:44:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:44:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:33.409783 543705 memory.go:184] no items to output this cycle
I0320 00:44:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 00:44:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:43.409793 543705 memory.go:191] Add success.
I0320 00:44:43.409794 543705 cpu.go:282] Add success.
I0320 00:44:43.419889 543705 net.go:648] Add success.
I0320 00:44:43.422919 543705 net.go:770] primary dev: ETH0
I0320 00:44:43.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:44:43.422949 543705 net.go:698] Add success.
I0320 00:44:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:44:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:44:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:44:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:44:53.409793 543705 memory.go:184] no items to output this cycle
I0320 00:44:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:45:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:03.409786 543705 memory.go:184] no items to output this cycle
I0320 00:45:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 00:45:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:13.409825 543705 memory.go:191] Add success.
I0320 00:45:13.409832 543705 cpu.go:282] Add success.
W0320 00:45:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:45:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:45:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:45:13.420155 543705 net.go:648] Add success.
I0320 00:45:13.423234 543705 net.go:770] primary dev: ETH0
I0320 00:45:13.423248 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:45:13.423269 543705 net.go:698] Add success.
I0320 00:45:13.469243 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9c9475e8-b8a3-45ff-9676-a24e85778715","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:45:13.469278 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:45:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:45:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:45:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 00:45:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:45:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 00:45:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:45:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:45:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:45:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:45:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:45:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:45:21.741401 543705 disk_info.go:125] begin check local disk info of client
I0320 00:45:21.743876 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:45:21.743882 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f2080 0xc0003f20c0]
E0320 00:45:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:23.409780 543705 memory.go:184] no items to output this cycle
I0320 00:45:23.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:45:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:33.409805 543705 memory.go:184] no items to output this cycle
I0320 00:45:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 00:45:38.000692 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:45:38.000699 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:45:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:43.410770 543705 memory.go:191] Add success.
I0320 00:45:43.409809 543705 cpu.go:282] Add success.
I0320 00:45:43.420495 543705 net.go:648] Add success.
I0320 00:45:43.423298 543705 net.go:770] primary dev: ETH0
I0320 00:45:43.423312 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:45:43.423326 543705 net.go:698] Add success.
I0320 00:45:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:45:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:45:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:45:53.409783 543705 memory.go:184] no items to output this cycle
I0320 00:45:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:46:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:03.409789 543705 memory.go:184] no items to output this cycle
I0320 00:46:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:46:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:13.409834 543705 memory.go:191] Add success.
I0320 00:46:13.409842 543705 cpu.go:282] Add success.
W0320 00:46:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:46:13.409883 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:46:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:46:13.420063 543705 net.go:648] Add success.
I0320 00:46:13.422720 543705 net.go:770] primary dev: ETH0
I0320 00:46:13.422736 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:46:13.422750 543705 net.go:698] Add success.
I0320 00:46:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:46:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:46:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 00:46:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:46:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 00:46:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:46:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:46:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:46:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:46:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:46:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:46:21.745366 543705 disk_info.go:125] begin check local disk info of client
I0320 00:46:21.747846 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:46:21.747852 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034a8c0 0xc00034a900]
E0320 00:46:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:23.409772 543705 memory.go:184] no items to output this cycle
I0320 00:46:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 00:46:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:33.409785 543705 memory.go:184] no items to output this cycle
I0320 00:46:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 00:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:43.409791 543705 memory.go:191] Add success.
I0320 00:46:43.409812 543705 cpu.go:282] Add success.
I0320 00:46:43.419966 543705 net.go:648] Add success.
I0320 00:46:43.422688 543705 net.go:770] primary dev: ETH0
I0320 00:46:43.422700 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:46:43.422713 543705 net.go:698] Add success.
I0320 00:46:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:46:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:46:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:46:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:46:53.409773 543705 memory.go:184] no items to output this cycle
I0320 00:46:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 00:47:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:03.409800 543705 memory.go:184] no items to output this cycle
I0320 00:47:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 00:47:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:13.409793 543705 memory.go:191] Add success.
I0320 00:47:13.409814 543705 cpu.go:282] Add success.
W0320 00:47:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:47:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:47:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:47:13.420392 543705 net.go:648] Add success.
I0320 00:47:13.423081 543705 net.go:770] primary dev: ETH0
I0320 00:47:13.423096 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:47:13.423107 543705 net.go:698] Add success.
I0320 00:47:13.453666 543705 event_worker.go:152] Polling the log file for events...
W0320 00:47:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:47:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 00:47:14.455166 543705 disk_worker.go:728] disk inode is not compliant
E0320 00:47:14.455983 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:47:14.455992 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:47:14.455998 543705 custom_config.go:64] query custom config with name: gpu
I0320 00:47:14.456460 543705 disk_worker.go:494] system disk:vda1
I0320 00:47:14.456490 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:47:15.456788 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:47:15.456798 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:47:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:47:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:47:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:47:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:47:16.472341 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:47:21.749379 543705 disk_info.go:125] begin check local disk info of client
I0320 00:47:21.751809 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:47:21.751814 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037cb00 0xc00037cb40]
E0320 00:47:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:23.409784 543705 memory.go:184] no items to output this cycle
I0320 00:47:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:47:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:33.409777 543705 memory.go:184] no items to output this cycle
I0320 00:47:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 00:47:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:43.409808 543705 memory.go:191] Add success.
I0320 00:47:43.409814 543705 cpu.go:282] Add success.
I0320 00:47:43.419973 543705 net.go:648] Add success.
I0320 00:47:43.422765 543705 net.go:770] primary dev: ETH0
I0320 00:47:43.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:47:43.422789 543705 net.go:698] Add success.
I0320 00:47:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:47:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:47:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:47:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:47:53.409777 543705 memory.go:184] no items to output this cycle
I0320 00:47:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:48:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:03.409800 543705 memory.go:184] no items to output this cycle
I0320 00:48:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:48:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:13.409830 543705 memory.go:191] Add success.
I0320 00:48:13.409830 543705 cpu.go:282] Add success.
W0320 00:48:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:48:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:48:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:48:13.420210 543705 net.go:648] Add success.
I0320 00:48:13.422976 543705 net.go:770] primary dev: ETH0
I0320 00:48:13.422990 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:48:13.423002 543705 net.go:698] Add success.
I0320 00:48:13.469476 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3ea3bb68-61fa-4c56-9321-4f0153a80e3a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:48:13.469519 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:48:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:48:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:48:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 00:48:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:48:14.456775 543705 disk_worker.go:494] system disk:vda1
I0320 00:48:14.456817 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:48:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:48:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:48:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:48:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:48:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:48:21.753398 543705 disk_info.go:125] begin check local disk info of client
I0320 00:48:21.755812 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:48:21.755818 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348380 0xc0003483c0]
E0320 00:48:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:23.409816 543705 memory.go:184] no items to output this cycle
I0320 00:48:23.409822 543705 cpu.go:275] no items to output this cycle
E0320 00:48:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 00:48:33.409792 543705 memory.go:184] no items to output this cycle
I0320 00:48:38.001736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:48:38.001742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:48:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:43.410596 543705 memory.go:191] Add success.
I0320 00:48:43.409820 543705 cpu.go:282] Add success.
I0320 00:48:43.420348 543705 net.go:648] Add success.
I0320 00:48:43.422966 543705 net.go:770] primary dev: ETH0
I0320 00:48:43.422978 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:48:43.422991 543705 net.go:698] Add success.
I0320 00:48:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:48:46.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:48:46.458099 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:48:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:48:53.409790 543705 memory.go:184] no items to output this cycle
I0320 00:48:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:49:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 00:49:03.409788 543705 memory.go:184] no items to output this cycle
W0320 00:49:13.409706 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:49:13.409722 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:49:13.409727 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 00:49:13.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:13.409821 543705 memory.go:191] Add success.
I0320 00:49:13.409829 543705 cpu.go:282] Add success.
I0320 00:49:13.420140 543705 net.go:648] Add success.
I0320 00:49:13.422923 543705 net.go:770] primary dev: ETH0
I0320 00:49:13.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:49:13.422950 543705 net.go:698] Add success.
I0320 00:49:14.453983 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:49:14.454233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:49:14.454242 543705 disk_worker.go:708] disk space is not compliant
W0320 00:49:14.454245 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:49:14.456066 543705 disk_worker.go:494] system disk:vda1
I0320 00:49:14.456105 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:49:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:49:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:49:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:49:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:49:21.755904 543705 disk_info.go:125] begin check local disk info of client
I0320 00:49:21.758363 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:49:21.758369 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034f280 0xc00034f2c0]
E0320 00:49:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:23.409789 543705 memory.go:184] no items to output this cycle
I0320 00:49:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 00:49:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:33.409775 543705 cpu.go:275] no items to output this cycle
I0320 00:49:33.409786 543705 memory.go:184] no items to output this cycle
E0320 00:49:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:43.409810 543705 memory.go:191] Add success.
I0320 00:49:43.409819 543705 cpu.go:282] Add success.
I0320 00:49:43.419953 543705 net.go:648] Add success.
I0320 00:49:43.422635 543705 net.go:770] primary dev: ETH0
I0320 00:49:43.422653 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:49:43.422668 543705 net.go:698] Add success.
I0320 00:49:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:49:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:49:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:49:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:49:53.409784 543705 memory.go:184] no items to output this cycle
I0320 00:49:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 00:50:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:03.409779 543705 memory.go:184] no items to output this cycle
I0320 00:50:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 00:50:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:13.409782 543705 memory.go:191] Add success.
W0320 00:50:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:50:13.409815 543705 cpu.go:282] Add success.
W0320 00:50:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:50:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:50:13.420217 543705 net.go:648] Add success.
I0320 00:50:13.422982 543705 net.go:770] primary dev: ETH0
I0320 00:50:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:50:13.423007 543705 net.go:698] Add success.
I0320 00:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:50:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:50:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 00:50:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:50:14.456831 543705 disk_worker.go:494] system disk:vda1
I0320 00:50:14.456874 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:50:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:50:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:50:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:50:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:50:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:50:21.759482 543705 disk_info.go:125] begin check local disk info of client
I0320 00:50:21.761949 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:50:21.761955 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003662c0 0xc000366300]
E0320 00:50:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:23.409758 543705 memory.go:184] no items to output this cycle
I0320 00:50:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 00:50:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:33.409765 543705 memory.go:184] no items to output this cycle
I0320 00:50:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:50:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:43.409794 543705 memory.go:191] Add success.
I0320 00:50:43.409803 543705 cpu.go:282] Add success.
I0320 00:50:43.419970 543705 net.go:648] Add success.
I0320 00:50:43.422731 543705 net.go:770] primary dev: ETH0
I0320 00:50:43.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:50:43.422761 543705 net.go:698] Add success.
I0320 00:50:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:50:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:50:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:50:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:50:53.409770 543705 memory.go:184] no items to output this cycle
I0320 00:50:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 00:51:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:03.409800 543705 memory.go:184] no items to output this cycle
I0320 00:51:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 00:51:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:13.409795 543705 memory.go:191] Add success.
I0320 00:51:13.409794 543705 cpu.go:282] Add success.
W0320 00:51:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:51:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:51:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:51:13.420112 543705 net.go:648] Add success.
I0320 00:51:13.423146 543705 net.go:770] primary dev: ETH0
I0320 00:51:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:51:13.423173 543705 net.go:698] Add success.
I0320 00:51:13.468051 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1d52854f-90cb-4e38-b882-fccf4a1d49d3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:51:13.468083 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:51:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:51:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:51:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 00:51:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:51:14.456511 543705 disk_worker.go:494] system disk:vda1
I0320 00:51:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:51:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:51:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:51:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:51:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:51:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:51:21.763443 543705 disk_info.go:125] begin check local disk info of client
I0320 00:51:21.765882 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:51:21.765888 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2a00 0xc0003b2a40]
E0320 00:51:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:23.409795 543705 memory.go:184] no items to output this cycle
I0320 00:51:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 00:51:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:33.409768 543705 memory.go:184] no items to output this cycle
I0320 00:51:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 00:51:38.004709 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:51:38.004714 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:51:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:43.410724 543705 memory.go:191] Add success.
I0320 00:51:43.409810 543705 cpu.go:282] Add success.
I0320 00:51:43.420411 543705 net.go:648] Add success.
I0320 00:51:43.423289 543705 net.go:770] primary dev: ETH0
I0320 00:51:43.423303 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:51:43.423316 543705 net.go:698] Add success.
I0320 00:51:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:51:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:51:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:51:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:51:53.409773 543705 memory.go:184] no items to output this cycle
I0320 00:51:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:52:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:03.409801 543705 memory.go:184] no items to output this cycle
I0320 00:52:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 00:52:13.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:13.409929 543705 cpu.go:282] Add success.
I0320 00:52:13.409931 543705 memory.go:191] Add success.
W0320 00:52:13.409973 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:52:13.409995 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:52:13.410000 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:52:13.419750 543705 net.go:648] Add success.
I0320 00:52:13.422444 543705 net.go:770] primary dev: ETH0
I0320 00:52:13.422458 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:52:13.422469 543705 net.go:698] Add success.
W0320 00:52:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:52:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 00:52:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:52:14.456823 543705 disk_worker.go:494] system disk:vda1
I0320 00:52:14.456863 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:52:14.457145 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:52:14.457153 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:52:14.457158 543705 custom_config.go:64] query custom config with name: gpu
E0320 00:52:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:52:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:52:16.457917 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:52:16.457917 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:52:16.457970 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:52:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:52:16.472307 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:52:21.766729 543705 disk_info.go:125] begin check local disk info of client
I0320 00:52:21.769111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:52:21.769117 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8440 0xc0003c8480]
E0320 00:52:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:23.409788 543705 memory.go:184] no items to output this cycle
I0320 00:52:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 00:52:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:33.409786 543705 memory.go:184] no items to output this cycle
I0320 00:52:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 00:52:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:43.409797 543705 cpu.go:282] Add success.
I0320 00:52:43.409800 543705 memory.go:191] Add success.
I0320 00:52:43.419844 543705 net.go:648] Add success.
I0320 00:52:43.422568 543705 net.go:770] primary dev: ETH0
I0320 00:52:43.422582 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:52:43.422594 543705 net.go:698] Add success.
I0320 00:52:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:52:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:52:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:52:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:52:53.409788 543705 memory.go:184] no items to output this cycle
I0320 00:52:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 00:53:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:03.409782 543705 memory.go:184] no items to output this cycle
I0320 00:53:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:53:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:13.409812 543705 memory.go:191] Add success.
I0320 00:53:13.409813 543705 cpu.go:282] Add success.
W0320 00:53:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:53:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:53:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:53:13.420225 543705 net.go:648] Add success.
I0320 00:53:13.423043 543705 net.go:770] primary dev: ETH0
I0320 00:53:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:53:13.423072 543705 net.go:698] Add success.
I0320 00:53:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:53:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:53:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 00:53:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:53:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 00:53:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:53:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:53:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:53:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:53:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:53:21.769676 543705 disk_info.go:125] begin check local disk info of client
I0320 00:53:21.772163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:53:21.772170 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003673c0 0xc000367400]
E0320 00:53:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:23.409773 543705 memory.go:184] no items to output this cycle
I0320 00:53:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 00:53:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:33.409805 543705 memory.go:184] no items to output this cycle
I0320 00:53:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 00:53:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:43.409788 543705 memory.go:191] Add success.
I0320 00:53:43.409812 543705 cpu.go:282] Add success.
I0320 00:53:43.419884 543705 net.go:648] Add success.
I0320 00:53:43.423084 543705 net.go:770] primary dev: ETH0
I0320 00:53:43.423099 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:53:43.423114 543705 net.go:698] Add success.
I0320 00:53:46.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:53:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:53:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:53:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:53:53.409774 543705 memory.go:184] no items to output this cycle
I0320 00:53:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 00:54:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:03.409784 543705 memory.go:184] no items to output this cycle
I0320 00:54:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 00:54:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:13.409802 543705 memory.go:191] Add success.
I0320 00:54:13.409803 543705 cpu.go:282] Add success.
W0320 00:54:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:54:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:54:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:54:13.420366 543705 net.go:648] Add success.
I0320 00:54:13.423251 543705 net.go:770] primary dev: ETH0
I0320 00:54:13.423264 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:54:13.423277 543705 net.go:698] Add success.
I0320 00:54:13.468184 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7bf02f9d-b0ea-4856-ab65-f23e8f762015","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:54:13.468216 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 00:54:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:54:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:54:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 00:54:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:54:14.456491 543705 disk_worker.go:494] system disk:vda1
I0320 00:54:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:54:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:54:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:54:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:54:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:54:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:54:21.773504 543705 disk_info.go:125] begin check local disk info of client
I0320 00:54:21.775931 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:54:21.775937 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4580 0xc0000c45c0]
E0320 00:54:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:23.409790 543705 memory.go:184] no items to output this cycle
I0320 00:54:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 00:54:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:33.409798 543705 memory.go:184] no items to output this cycle
I0320 00:54:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 00:54:38.005739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:54:38.005745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:54:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:43.410597 543705 memory.go:191] Add success.
I0320 00:54:43.409786 543705 cpu.go:282] Add success.
I0320 00:54:43.420277 543705 net.go:648] Add success.
I0320 00:54:43.422805 543705 net.go:770] primary dev: ETH0
I0320 00:54:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:54:43.422831 543705 net.go:698] Add success.
I0320 00:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:54:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:54:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:54:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:54:53.409771 543705 memory.go:184] no items to output this cycle
I0320 00:54:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 00:55:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:03.409840 543705 memory.go:184] no items to output this cycle
I0320 00:55:03.409928 543705 cpu.go:275] no items to output this cycle
E0320 00:55:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:13.409790 543705 memory.go:191] Add success.
W0320 00:55:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 00:55:13.409815 543705 cpu.go:282] Add success.
W0320 00:55:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:55:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:55:13.420145 543705 net.go:648] Add success.
I0320 00:55:13.422983 543705 net.go:770] primary dev: ETH0
I0320 00:55:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:55:13.423008 543705 net.go:698] Add success.
I0320 00:55:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:55:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:55:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 00:55:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:55:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 00:55:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:55:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:55:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:55:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:55:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:55:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:55:21.776019 543705 disk_info.go:125] begin check local disk info of client
I0320 00:55:21.778442 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:55:21.778448 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330380 0xc0003303c0]
E0320 00:55:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:23.409782 543705 memory.go:184] no items to output this cycle
I0320 00:55:23.409793 543705 cpu.go:275] no items to output this cycle
E0320 00:55:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:33.409797 543705 memory.go:184] no items to output this cycle
I0320 00:55:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 00:55:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:43.409795 543705 memory.go:191] Add success.
I0320 00:55:43.409796 543705 cpu.go:282] Add success.
I0320 00:55:43.419871 543705 net.go:648] Add success.
I0320 00:55:43.422945 543705 net.go:770] primary dev: ETH0
I0320 00:55:43.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:55:43.422970 543705 net.go:698] Add success.
I0320 00:55:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:55:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:55:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:55:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:55:53.409786 543705 memory.go:184] no items to output this cycle
I0320 00:55:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 00:56:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:03.409899 543705 memory.go:184] no items to output this cycle
I0320 00:56:03.409926 543705 cpu.go:275] no items to output this cycle
E0320 00:56:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:13.409817 543705 memory.go:191] Add success.
I0320 00:56:13.409820 543705 cpu.go:282] Add success.
W0320 00:56:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:56:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:56:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:56:13.420434 543705 net.go:648] Add success.
I0320 00:56:13.423580 543705 net.go:770] primary dev: ETH0
I0320 00:56:13.423593 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:56:13.423605 543705 net.go:698] Add success.
I0320 00:56:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:56:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:56:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 00:56:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:56:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 00:56:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:56:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:56:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:56:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:56:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:56:21.779540 543705 disk_info.go:125] begin check local disk info of client
I0320 00:56:21.781944 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:56:21.781950 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007af40 0xc00007af80]
E0320 00:56:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:23.409784 543705 memory.go:184] no items to output this cycle
I0320 00:56:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 00:56:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:33.409780 543705 memory.go:184] no items to output this cycle
I0320 00:56:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 00:56:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:43.409785 543705 memory.go:191] Add success.
I0320 00:56:43.409804 543705 cpu.go:282] Add success.
I0320 00:56:43.419858 543705 net.go:648] Add success.
I0320 00:56:43.422281 543705 net.go:770] primary dev: ETH0
I0320 00:56:43.422296 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:56:43.422309 543705 net.go:698] Add success.
I0320 00:56:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:56:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:56:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:56:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:56:53.409767 543705 memory.go:184] no items to output this cycle
I0320 00:56:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 00:57:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:03.409784 543705 memory.go:184] no items to output this cycle
I0320 00:57:03.409787 543705 cpu.go:275] no items to output this cycle
W0320 00:57:13.409720 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:57:13.409739 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:57:13.409746 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:57:13.409812 543705 cpu.go:282] Add success.
E0320 00:57:13.409846 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:13.409871 543705 memory.go:191] Add success.
I0320 00:57:13.420134 543705 net.go:648] Add success.
I0320 00:57:13.429000 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 00:57:13.429091 543705 net.go:770] primary dev: ETH0
I0320 00:57:13.429106 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:57:13.429119 543705 net.go:698] Add success.
I0320 00:57:13.453677 543705 event_worker.go:152] Polling the log file for events...
I0320 00:57:13.464725 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"49cb3540-98de-4324-b70f-3c85bfac55ad","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 00:57:13.464758 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 00:57:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:57:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 00:57:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:57:14.456921 543705 disk_worker.go:494] system disk:vda1
I0320 00:57:14.456961 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 00:57:14.457088 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 00:57:14.457095 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 00:57:14.457098 543705 custom_config.go:64] query custom config with name: gpu
E0320 00:57:15.456788 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 00:57:15.456796 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:57:16.457901 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 00:57:16.457901 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 00:57:16.457956 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:57:16.457976 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:57:16.472313 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:57:21.783552 543705 disk_info.go:125] begin check local disk info of client
I0320 00:57:21.786063 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:57:21.786069 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5e40 0xc0000c5e80]
E0320 00:57:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:23.409766 543705 cpu.go:275] no items to output this cycle
I0320 00:57:23.409776 543705 memory.go:184] no items to output this cycle
E0320 00:57:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:33.409795 543705 memory.go:184] no items to output this cycle
I0320 00:57:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 00:57:38.008728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 00:57:38.008734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 00:57:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:43.410672 543705 memory.go:191] Add success.
I0320 00:57:43.409799 543705 cpu.go:282] Add success.
I0320 00:57:43.420344 543705 net.go:648] Add success.
I0320 00:57:43.423260 543705 net.go:770] primary dev: ETH0
I0320 00:57:43.423273 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:57:43.423284 543705 net.go:698] Add success.
I0320 00:57:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:57:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:57:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:57:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:57:53.409809 543705 memory.go:184] no items to output this cycle
I0320 00:57:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 00:58:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:03.409776 543705 memory.go:184] no items to output this cycle
I0320 00:58:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 00:58:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:13.409798 543705 memory.go:191] Add success.
I0320 00:58:13.409804 543705 cpu.go:282] Add success.
W0320 00:58:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:58:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:58:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:58:13.420190 543705 net.go:648] Add success.
I0320 00:58:13.422770 543705 net.go:770] primary dev: ETH0
I0320 00:58:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:58:13.422795 543705 net.go:698] Add success.
I0320 00:58:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:58:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:58:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 00:58:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:58:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 00:58:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:58:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:58:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:58:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:58:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:58:21.787572 543705 disk_info.go:125] begin check local disk info of client
I0320 00:58:21.790020 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:58:21.790026 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c5740 0xc0004c5780]
E0320 00:58:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:23.409770 543705 memory.go:184] no items to output this cycle
I0320 00:58:23.409807 543705 cpu.go:275] no items to output this cycle
E0320 00:58:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:33.409798 543705 memory.go:184] no items to output this cycle
I0320 00:58:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 00:58:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:43.409816 543705 memory.go:191] Add success.
I0320 00:58:43.409826 543705 cpu.go:282] Add success.
I0320 00:58:43.419888 543705 net.go:648] Add success.
I0320 00:58:43.422717 543705 net.go:770] primary dev: ETH0
I0320 00:58:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:58:43.422741 543705 net.go:698] Add success.
I0320 00:58:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:58:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:58:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:58:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:58:53.409798 543705 memory.go:184] no items to output this cycle
I0320 00:58:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 00:59:03.409851 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:03.409869 543705 memory.go:184] no items to output this cycle
I0320 00:59:03.409953 543705 cpu.go:275] no items to output this cycle
E0320 00:59:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:13.409827 543705 memory.go:191] Add success.
I0320 00:59:13.409838 543705 cpu.go:282] Add success.
W0320 00:59:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 00:59:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 00:59:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 00:59:13.420358 543705 net.go:648] Add success.
I0320 00:59:13.423213 543705 net.go:770] primary dev: ETH0
I0320 00:59:13.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:59:13.423238 543705 net.go:698] Add success.
I0320 00:59:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 00:59:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 00:59:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 00:59:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 00:59:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 00:59:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 00:59:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 00:59:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:59:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:59:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 00:59:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 00:59:21.791637 543705 disk_info.go:125] begin check local disk info of client
I0320 00:59:21.794135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 00:59:21.794141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aab80 0xc0001aabc0]
E0320 00:59:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:23.409769 543705 memory.go:184] no items to output this cycle
I0320 00:59:23.409780 543705 cpu.go:275] no items to output this cycle
E0320 00:59:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:33.409774 543705 memory.go:184] no items to output this cycle
I0320 00:59:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 00:59:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:43.409790 543705 memory.go:191] Add success.
I0320 00:59:43.409792 543705 cpu.go:282] Add success.
I0320 00:59:43.419958 543705 net.go:648] Add success.
I0320 00:59:43.422890 543705 net.go:770] primary dev: ETH0
I0320 00:59:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 00:59:43.422916 543705 net.go:698] Add success.
I0320 00:59:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 00:59:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 00:59:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 00:59:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 00:59:53.409781 543705 memory.go:184] no items to output this cycle
I0320 00:59:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 01:00:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:03.409790 543705 memory.go:184] no items to output this cycle
I0320 01:00:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 01:00:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:13.409794 543705 memory.go:191] Add success.
I0320 01:00:13.409793 543705 cpu.go:282] Add success.
W0320 01:00:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:00:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:00:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:00:13.420217 543705 net.go:648] Add success.
I0320 01:00:13.423111 543705 net.go:770] primary dev: ETH0
I0320 01:00:13.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:00:13.423138 543705 net.go:698] Add success.
I0320 01:00:13.463681 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6d79f950-0394-4358-9d6e-b599bfafa392","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:00:13.463714 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:00:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:00:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:00:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 01:00:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:00:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 01:00:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:00:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:00:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:00:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:00:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:00:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:00:21.795614 543705 disk_info.go:125] begin check local disk info of client
I0320 01:00:21.798062 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:00:21.798068 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bd80 0xc00007bdc0]
E0320 01:00:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:23.409784 543705 memory.go:184] no items to output this cycle
I0320 01:00:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:00:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:33.409800 543705 memory.go:184] no items to output this cycle
I0320 01:00:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 01:00:38.009739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:00:38.009745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:00:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:43.410811 543705 memory.go:191] Add success.
I0320 01:00:43.409818 543705 cpu.go:282] Add success.
I0320 01:00:43.420485 543705 net.go:648] Add success.
I0320 01:00:43.423104 543705 net.go:770] primary dev: ETH0
I0320 01:00:43.423117 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:00:43.423131 543705 net.go:698] Add success.
I0320 01:00:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:00:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:00:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:00:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:00:53.409774 543705 memory.go:184] no items to output this cycle
I0320 01:00:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 01:01:03.409837 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:03.409857 543705 memory.go:184] no items to output this cycle
I0320 01:01:03.409989 543705 cpu.go:275] no items to output this cycle
E0320 01:01:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:13.409799 543705 memory.go:191] Add success.
I0320 01:01:13.409815 543705 cpu.go:282] Add success.
W0320 01:01:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:01:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:01:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:01:13.420148 543705 net.go:648] Add success.
I0320 01:01:13.422968 543705 net.go:770] primary dev: ETH0
I0320 01:01:13.422980 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:01:13.422993 543705 net.go:698] Add success.
I0320 01:01:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:01:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:01:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 01:01:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:01:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 01:01:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:01:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:01:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:01:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:01:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:01:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:01:21.798154 543705 disk_info.go:125] begin check local disk info of client
I0320 01:01:21.800597 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:01:21.800604 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034d4c0 0xc00034d500]
E0320 01:01:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:23.409787 543705 memory.go:184] no items to output this cycle
I0320 01:01:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 01:01:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:33.409795 543705 memory.go:184] no items to output this cycle
I0320 01:01:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 01:01:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:43.409782 543705 memory.go:191] Add success.
I0320 01:01:43.409802 543705 cpu.go:282] Add success.
I0320 01:01:43.420002 543705 net.go:648] Add success.
I0320 01:01:43.422821 543705 net.go:770] primary dev: ETH0
I0320 01:01:43.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:01:43.422845 543705 net.go:698] Add success.
I0320 01:01:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:01:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:01:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:01:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:01:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:01:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:02:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:03.409779 543705 memory.go:184] no items to output this cycle
I0320 01:02:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:02:13.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:13.409902 543705 memory.go:191] Add success.
W0320 01:02:13.409967 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:02:13.409989 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:02:13.409992 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:02:13.409994 543705 cpu.go:282] Add success.
I0320 01:02:13.419713 543705 net.go:648] Add success.
I0320 01:02:13.422450 543705 net.go:770] primary dev: ETH0
I0320 01:02:13.422463 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:02:13.422474 543705 net.go:698] Add success.
W0320 01:02:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:02:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 01:02:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:02:14.455907 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:02:14.455915 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:02:14.455921 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:02:14.456654 543705 disk_worker.go:494] system disk:vda1
I0320 01:02:14.456697 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:02:15.456823 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:02:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:02:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:02:16.457989 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:02:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:02:16.458042 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:02:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:02:21.801672 543705 disk_info.go:125] begin check local disk info of client
I0320 01:02:21.804096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:02:21.804102 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329380 0xc0003293c0]
E0320 01:02:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:23.409769 543705 memory.go:184] no items to output this cycle
I0320 01:02:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 01:02:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:33.409772 543705 memory.go:184] no items to output this cycle
I0320 01:02:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 01:02:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:43.409791 543705 memory.go:191] Add success.
I0320 01:02:43.409791 543705 cpu.go:282] Add success.
I0320 01:02:43.419848 543705 net.go:648] Add success.
I0320 01:02:43.422459 543705 net.go:770] primary dev: ETH0
I0320 01:02:43.422472 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:02:43.422484 543705 net.go:698] Add success.
I0320 01:02:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:02:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:02:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:02:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:02:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:02:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:03:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:03.409795 543705 memory.go:184] no items to output this cycle
I0320 01:03:03.409837 543705 cpu.go:275] no items to output this cycle
E0320 01:03:13.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:13.409933 543705 cpu.go:282] Add success.
I0320 01:03:13.409934 543705 memory.go:191] Add success.
W0320 01:03:13.409973 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:03:13.409992 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:03:13.409996 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:03:13.419766 543705 net.go:648] Add success.
I0320 01:03:13.422597 543705 net.go:770] primary dev: ETH0
I0320 01:03:13.422613 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:03:13.422626 543705 net.go:698] Add success.
I0320 01:03:13.464105 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5a238178-3a3c-4fa8-98e8-6804248a142b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:03:13.464136 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:03:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:03:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:03:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0320 01:03:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:03:14.456792 543705 disk_worker.go:494] system disk:vda1
I0320 01:03:14.456828 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:03:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:03:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:03:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:03:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:03:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:03:21.805673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:03:21.808081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:03:21.808087 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032f000 0xc00032f040]
E0320 01:03:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:23.409802 543705 memory.go:184] no items to output this cycle
I0320 01:03:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 01:03:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:33.409770 543705 memory.go:184] no items to output this cycle
I0320 01:03:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 01:03:38.012748 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:03:38.012755 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:03:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:43.410595 543705 memory.go:191] Add success.
I0320 01:03:43.409789 543705 cpu.go:282] Add success.
I0320 01:03:43.420274 543705 net.go:648] Add success.
I0320 01:03:43.422943 543705 net.go:770] primary dev: ETH0
I0320 01:03:43.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:03:43.422968 543705 net.go:698] Add success.
I0320 01:03:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:03:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:03:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:03:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:03:53.409784 543705 memory.go:184] no items to output this cycle
I0320 01:03:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:04:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:03.409769 543705 memory.go:184] no items to output this cycle
I0320 01:04:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:04:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:13.409788 543705 memory.go:191] Add success.
I0320 01:04:13.409805 543705 cpu.go:282] Add success.
W0320 01:04:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:04:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:04:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:04:13.420245 543705 net.go:648] Add success.
I0320 01:04:13.422967 543705 net.go:770] primary dev: ETH0
I0320 01:04:13.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:04:13.422996 543705 net.go:698] Add success.
I0320 01:04:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:04:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:04:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0320 01:04:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:04:14.456471 543705 disk_worker.go:494] system disk:vda1
I0320 01:04:14.456513 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:04:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:04:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:04:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:04:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:04:21.809671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:04:21.812088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:04:21.812094 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034dd40 0xc00034dd80]
E0320 01:04:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:04:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:04:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:33.409778 543705 memory.go:184] no items to output this cycle
I0320 01:04:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 01:04:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:43.409784 543705 memory.go:191] Add success.
I0320 01:04:43.409808 543705 cpu.go:282] Add success.
I0320 01:04:43.419994 543705 net.go:648] Add success.
I0320 01:04:43.422991 543705 net.go:770] primary dev: ETH0
I0320 01:04:43.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:04:43.423015 543705 net.go:698] Add success.
I0320 01:04:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:04:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:04:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:04:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:04:53.409794 543705 cpu.go:275] no items to output this cycle
I0320 01:04:53.409796 543705 memory.go:184] no items to output this cycle
E0320 01:05:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:03.409781 543705 memory.go:184] no items to output this cycle
I0320 01:05:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:05:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:13.409833 543705 memory.go:191] Add success.
I0320 01:05:13.409839 543705 cpu.go:282] Add success.
W0320 01:05:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:05:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:05:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:05:13.420184 543705 net.go:648] Add success.
I0320 01:05:13.422937 543705 net.go:770] primary dev: ETH0
I0320 01:05:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:05:13.422962 543705 net.go:698] Add success.
I0320 01:05:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:05:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:05:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 01:05:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:05:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 01:05:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:05:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:05:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:05:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:05:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:05:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:05:21.813690 543705 disk_info.go:125] begin check local disk info of client
I0320 01:05:21.816107 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:05:21.816113 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034cac0 0xc00034cb00]
E0320 01:05:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:23.409755 543705 memory.go:184] no items to output this cycle
I0320 01:05:23.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:05:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:33.409789 543705 memory.go:184] no items to output this cycle
I0320 01:05:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:05:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:43.409792 543705 memory.go:191] Add success.
I0320 01:05:43.409793 543705 cpu.go:282] Add success.
I0320 01:05:43.419876 543705 net.go:648] Add success.
I0320 01:05:43.422666 543705 net.go:770] primary dev: ETH0
I0320 01:05:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:05:43.422701 543705 net.go:698] Add success.
I0320 01:05:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:05:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:05:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:05:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:05:53.409812 543705 memory.go:184] no items to output this cycle
I0320 01:05:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 01:06:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:03.409788 543705 memory.go:184] no items to output this cycle
I0320 01:06:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:06:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:13.409804 543705 memory.go:191] Add success.
I0320 01:06:13.409821 543705 cpu.go:282] Add success.
W0320 01:06:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:06:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:06:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:06:13.420172 543705 net.go:648] Add success.
I0320 01:06:13.422902 543705 net.go:770] primary dev: ETH0
I0320 01:06:13.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:06:13.422927 543705 net.go:698] Add success.
I0320 01:06:13.463304 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5d823a03-da05-4f29-8ef6-22bdbdf72998","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:06:13.463337 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:06:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:06:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 01:06:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:06:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 01:06:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:06:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:06:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:06:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:06:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:06:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:06:21.817675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:06:21.820100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:06:21.820106 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d5540 0xc0004d5580]
E0320 01:06:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:23.409802 543705 memory.go:184] no items to output this cycle
I0320 01:06:23.409815 543705 cpu.go:275] no items to output this cycle
E0320 01:06:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:33.409793 543705 memory.go:184] no items to output this cycle
I0320 01:06:33.409794 543705 cpu.go:275] no items to output this cycle
I0320 01:06:38.013729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:06:38.013736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:06:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:43.410568 543705 memory.go:191] Add success.
I0320 01:06:43.409798 543705 cpu.go:282] Add success.
I0320 01:06:43.420339 543705 net.go:648] Add success.
I0320 01:06:43.422850 543705 net.go:770] primary dev: ETH0
I0320 01:06:43.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:06:43.422877 543705 net.go:698] Add success.
I0320 01:06:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:06:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:06:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:06:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:06:53.409802 543705 memory.go:184] no items to output this cycle
I0320 01:06:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 01:07:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:03.409797 543705 memory.go:184] no items to output this cycle
I0320 01:07:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 01:07:13.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:13.409898 543705 memory.go:191] Add success.
W0320 01:07:13.409926 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:07:13.409938 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:07:13.409941 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:07:13.410010 543705 cpu.go:282] Add success.
I0320 01:07:13.419715 543705 net.go:648] Add success.
I0320 01:07:13.422562 543705 net.go:770] primary dev: ETH0
I0320 01:07:13.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:07:13.422605 543705 net.go:698] Add success.
I0320 01:07:13.453137 543705 event_worker.go:152] Polling the log file for events...
W0320 01:07:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:07:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 01:07:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:07:14.455901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:07:14.455910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:07:14.455916 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:07:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 01:07:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:07:15.456783 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:07:15.456792 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:07:16.457914 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:07:16.457913 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:07:16.457966 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:07:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:07:16.472312 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:07:21.821681 543705 disk_info.go:125] begin check local disk info of client
I0320 01:07:21.824056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:07:21.824062 543705 disk_info.go:196] parse disk info done, disk is : [0xc000536f00 0xc000536f40]
E0320 01:07:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:07:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:07:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:33.409762 543705 memory.go:184] no items to output this cycle
I0320 01:07:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 01:07:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:43.409811 543705 memory.go:191] Add success.
I0320 01:07:43.409819 543705 cpu.go:282] Add success.
I0320 01:07:43.419974 543705 net.go:648] Add success.
I0320 01:07:43.422773 543705 net.go:770] primary dev: ETH0
I0320 01:07:43.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:07:43.422799 543705 net.go:698] Add success.
I0320 01:07:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:07:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:07:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:07:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:07:53.409803 543705 memory.go:184] no items to output this cycle
I0320 01:07:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:08:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:03.409777 543705 memory.go:184] no items to output this cycle
I0320 01:08:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 01:08:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:13.409820 543705 memory.go:191] Add success.
I0320 01:08:13.409828 543705 cpu.go:282] Add success.
W0320 01:08:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:08:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:08:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:08:13.420369 543705 net.go:648] Add success.
I0320 01:08:13.423044 543705 net.go:770] primary dev: ETH0
I0320 01:08:13.423059 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:08:13.423074 543705 net.go:698] Add success.
I0320 01:08:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:08:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:08:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 01:08:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:08:14.456569 543705 disk_worker.go:494] system disk:vda1
I0320 01:08:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:08:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:08:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:08:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:08:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:08:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:08:21.825674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:08:21.828053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:08:21.828059 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033bd00 0xc00033bd40]
E0320 01:08:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:23.409799 543705 memory.go:184] no items to output this cycle
I0320 01:08:23.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:08:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:33.409775 543705 memory.go:184] no items to output this cycle
I0320 01:08:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 01:08:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:43.409793 543705 memory.go:191] Add success.
I0320 01:08:43.409795 543705 cpu.go:282] Add success.
I0320 01:08:43.419984 543705 net.go:648] Add success.
I0320 01:08:43.422617 543705 net.go:770] primary dev: ETH0
I0320 01:08:43.422630 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:08:43.422645 543705 net.go:698] Add success.
I0320 01:08:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:08:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:08:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:08:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:08:53.409784 543705 memory.go:184] no items to output this cycle
I0320 01:08:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 01:09:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:03.409764 543705 memory.go:184] no items to output this cycle
I0320 01:09:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:09:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:13.409825 543705 memory.go:191] Add success.
I0320 01:09:13.409831 543705 cpu.go:282] Add success.
W0320 01:09:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:09:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:09:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:09:13.420144 543705 net.go:648] Add success.
I0320 01:09:13.423063 543705 net.go:770] primary dev: ETH0
I0320 01:09:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:09:13.423092 543705 net.go:698] Add success.
I0320 01:09:13.464306 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"edc0394f-094d-42c6-9ffd-164162f9c867","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:09:13.464339 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:09:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:09:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 01:09:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:09:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 01:09:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:09:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:09:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:09:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:09:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:09:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:09:21.829673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:09:21.832152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:09:21.832159 543705 disk_info.go:196] parse disk info done, disk is : [0xc000323180 0xc0003231c0]
E0320 01:09:23.409740 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:23.409754 543705 memory.go:184] no items to output this cycle
I0320 01:09:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:09:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:33.409796 543705 memory.go:184] no items to output this cycle
I0320 01:09:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 01:09:38.013873 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:09:38.013879 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:09:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:43.410720 543705 memory.go:191] Add success.
I0320 01:09:43.409816 543705 cpu.go:282] Add success.
I0320 01:09:43.420440 543705 net.go:648] Add success.
I0320 01:09:43.422994 543705 net.go:770] primary dev: ETH0
I0320 01:09:43.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:09:43.423021 543705 net.go:698] Add success.
I0320 01:09:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:09:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:09:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:09:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:09:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:09:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 01:10:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:03.409765 543705 memory.go:184] no items to output this cycle
I0320 01:10:03.409894 543705 cpu.go:275] no items to output this cycle
E0320 01:10:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:13.409818 543705 memory.go:191] Add success.
I0320 01:10:13.409830 543705 cpu.go:282] Add success.
W0320 01:10:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:10:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:10:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:10:13.420208 543705 net.go:648] Add success.
I0320 01:10:13.422901 543705 net.go:770] primary dev: ETH0
I0320 01:10:13.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:10:13.422927 543705 net.go:698] Add success.
I0320 01:10:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:10:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:10:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 01:10:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:10:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 01:10:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:10:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:10:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:10:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:10:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:10:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:10:21.833673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:10:21.836142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:10:21.836148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c9540 0xc0004c9580]
E0320 01:10:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:23.409759 543705 memory.go:184] no items to output this cycle
I0320 01:10:23.409778 543705 cpu.go:275] no items to output this cycle
E0320 01:10:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:33.409767 543705 memory.go:184] no items to output this cycle
I0320 01:10:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 01:10:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:43.409809 543705 memory.go:191] Add success.
I0320 01:10:43.409813 543705 cpu.go:282] Add success.
I0320 01:10:43.419838 543705 net.go:648] Add success.
I0320 01:10:43.422237 543705 net.go:770] primary dev: ETH0
I0320 01:10:43.422251 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:10:43.422263 543705 net.go:698] Add success.
I0320 01:10:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:10:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:10:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:10:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:10:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:10:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:11:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:03.409769 543705 memory.go:184] no items to output this cycle
I0320 01:11:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 01:11:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:13.409799 543705 memory.go:191] Add success.
W0320 01:11:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:11:13.409834 543705 cpu.go:282] Add success.
W0320 01:11:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:11:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:11:13.420163 543705 net.go:648] Add success.
I0320 01:11:13.422905 543705 net.go:770] primary dev: ETH0
I0320 01:11:13.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:11:13.422935 543705 net.go:698] Add success.
I0320 01:11:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:11:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:11:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 01:11:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:11:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 01:11:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:11:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:11:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:11:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:11:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:11:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:11:21.837671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:11:21.840094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:11:21.840101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c8b00 0xc0004c8b40]
E0320 01:11:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:23.409767 543705 memory.go:184] no items to output this cycle
I0320 01:11:23.409779 543705 cpu.go:275] no items to output this cycle
E0320 01:11:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:33.409794 543705 memory.go:184] no items to output this cycle
I0320 01:11:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 01:11:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:43.409783 543705 memory.go:191] Add success.
I0320 01:11:43.409783 543705 cpu.go:282] Add success.
I0320 01:11:43.420284 543705 net.go:648] Add success.
I0320 01:11:43.423081 543705 net.go:770] primary dev: ETH0
I0320 01:11:43.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:11:43.423107 543705 net.go:698] Add success.
I0320 01:11:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:11:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:11:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:11:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:11:53.409806 543705 memory.go:184] no items to output this cycle
I0320 01:11:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 01:12:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:03.409774 543705 memory.go:184] no items to output this cycle
I0320 01:12:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:12:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:13.409871 543705 cpu.go:282] Add success.
I0320 01:12:13.409893 543705 memory.go:191] Add success.
W0320 01:12:13.409922 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:12:13.409936 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:12:13.409947 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:12:13.419761 543705 net.go:648] Add success.
I0320 01:12:13.422692 543705 net.go:770] primary dev: ETH0
I0320 01:12:13.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:12:13.422723 543705 net.go:698] Add success.
I0320 01:12:13.631403 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31e488d0-3b41-4f64-a1fc-971eb93c0a2e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:12:13.631435 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 01:12:14.454194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:12:14.454206 543705 disk_worker.go:708] disk space is not compliant
W0320 01:12:14.454208 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:12:14.455448 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:12:14.455468 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:12:14.455474 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:12:14.456530 543705 disk_worker.go:494] system disk:vda1
I0320 01:12:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:12:15.456840 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:12:15.456849 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:12:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:12:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:12:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:12:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:12:16.472331 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:12:21.841673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:12:21.844056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:12:21.844062 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003527c0 0xc000352800]
E0320 01:12:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:12:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:12:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:33.409779 543705 cpu.go:275] no items to output this cycle
I0320 01:12:33.409783 543705 memory.go:184] no items to output this cycle
I0320 01:12:38.014018 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:12:38.014025 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:12:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:43.410687 543705 memory.go:191] Add success.
I0320 01:12:43.409797 543705 cpu.go:282] Add success.
I0320 01:12:43.420365 543705 net.go:648] Add success.
I0320 01:12:43.422992 543705 net.go:770] primary dev: ETH0
I0320 01:12:43.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:12:43.423021 543705 net.go:698] Add success.
I0320 01:12:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:12:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:12:46.458096 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:12:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:12:53.409788 543705 memory.go:184] no items to output this cycle
I0320 01:12:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 01:13:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:03.409773 543705 memory.go:184] no items to output this cycle
I0320 01:13:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 01:13:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:13.409799 543705 memory.go:191] Add success.
I0320 01:13:13.409824 543705 cpu.go:282] Add success.
W0320 01:13:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:13:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:13:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:13:13.420163 543705 net.go:648] Add success.
I0320 01:13:13.422877 543705 net.go:770] primary dev: ETH0
I0320 01:13:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:13:13.422906 543705 net.go:698] Add success.
I0320 01:13:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:13:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:13:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 01:13:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:13:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 01:13:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:13:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:13:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:13:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:13:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:13:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:13:21.845675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:13:21.848142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:13:21.848149 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be80 0xc00007bec0]
I0320 01:13:23.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:13:23.409818 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:23.409831 543705 memory.go:184] no items to output this cycle
E0320 01:13:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:33.409785 543705 memory.go:184] no items to output this cycle
I0320 01:13:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:13:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:43.409817 543705 memory.go:191] Add success.
I0320 01:13:43.409817 543705 cpu.go:282] Add success.
I0320 01:13:43.419958 543705 net.go:648] Add success.
I0320 01:13:43.422706 543705 net.go:770] primary dev: ETH0
I0320 01:13:43.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:13:43.422732 543705 net.go:698] Add success.
I0320 01:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:13:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:13:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:13:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:13:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:13:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:14:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:03.409778 543705 memory.go:184] no items to output this cycle
I0320 01:14:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 01:14:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:13.409788 543705 memory.go:191] Add success.
I0320 01:14:13.409789 543705 cpu.go:282] Add success.
W0320 01:14:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:14:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:14:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:14:13.419759 543705 net.go:648] Add success.
I0320 01:14:13.422477 543705 net.go:770] primary dev: ETH0
I0320 01:14:13.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:14:13.422506 543705 net.go:698] Add success.
I0320 01:14:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:14:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:14:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 01:14:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:14:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 01:14:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:14:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:14:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:14:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:14:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:14:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:14:21.849671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:14:21.852147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:14:21.852154 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034d580 0xc00034d5c0]
E0320 01:14:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:23.409771 543705 memory.go:184] no items to output this cycle
I0320 01:14:23.409771 543705 cpu.go:275] no items to output this cycle
E0320 01:14:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:33.409775 543705 cpu.go:275] no items to output this cycle
I0320 01:14:33.409784 543705 memory.go:184] no items to output this cycle
E0320 01:14:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:43.409820 543705 memory.go:191] Add success.
I0320 01:14:43.409826 543705 cpu.go:282] Add success.
I0320 01:14:43.419958 543705 net.go:648] Add success.
I0320 01:14:43.422731 543705 net.go:770] primary dev: ETH0
I0320 01:14:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:14:43.422757 543705 net.go:698] Add success.
I0320 01:14:46.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:14:46.458071 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:14:46.458098 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:14:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:14:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:14:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:15:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:03.409801 543705 memory.go:184] no items to output this cycle
I0320 01:15:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:15:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:13.409790 543705 memory.go:191] Add success.
W0320 01:15:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:15:13.409824 543705 cpu.go:282] Add success.
W0320 01:15:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:15:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:15:13.419722 543705 net.go:648] Add success.
I0320 01:15:13.422356 543705 net.go:770] primary dev: ETH0
I0320 01:15:13.422369 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:15:13.422379 543705 net.go:698] Add success.
I0320 01:15:13.468480 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2857b90e-a280-4791-b960-d8303e729b83","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:15:13.468520 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:15:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:15:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:15:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 01:15:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:15:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 01:15:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:15:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:15:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:15:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:15:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:15:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:15:21.853673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:15:21.856128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:15:21.856134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bccc0 0xc0004bcd00]
E0320 01:15:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:23.409797 543705 memory.go:184] no items to output this cycle
I0320 01:15:23.409813 543705 cpu.go:275] no items to output this cycle
E0320 01:15:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:33.409781 543705 memory.go:184] no items to output this cycle
I0320 01:15:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 01:15:38.016777 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:15:38.016783 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:15:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:43.410895 543705 memory.go:191] Add success.
I0320 01:15:43.409827 543705 cpu.go:282] Add success.
I0320 01:15:43.420607 543705 net.go:648] Add success.
I0320 01:15:43.423267 543705 net.go:770] primary dev: ETH0
I0320 01:15:43.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:15:43.423294 543705 net.go:698] Add success.
I0320 01:15:46.458246 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:15:46.458320 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:15:46.458347 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:15:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:15:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:15:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 01:16:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:03.409803 543705 memory.go:184] no items to output this cycle
I0320 01:16:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 01:16:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:13.409837 543705 memory.go:191] Add success.
I0320 01:16:13.409843 543705 cpu.go:282] Add success.
W0320 01:16:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:16:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:16:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:16:13.420208 543705 net.go:648] Add success.
I0320 01:16:13.422877 543705 net.go:770] primary dev: ETH0
I0320 01:16:13.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:16:13.422903 543705 net.go:698] Add success.
I0320 01:16:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:16:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:16:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 01:16:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:16:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 01:16:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:16:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:16:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:16:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:16:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:16:16.472432 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:16:21.857674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:16:21.860118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:16:21.860124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aae00 0xc0001aae40]
E0320 01:16:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:23.409792 543705 memory.go:184] no items to output this cycle
I0320 01:16:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 01:16:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:33.409803 543705 memory.go:184] no items to output this cycle
I0320 01:16:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 01:16:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:43.409777 543705 memory.go:191] Add success.
I0320 01:16:43.409800 543705 cpu.go:282] Add success.
I0320 01:16:43.419688 543705 net.go:770] primary dev: ETH0
I0320 01:16:43.419702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:16:43.419718 543705 net.go:698] Add success.
I0320 01:16:43.420101 543705 net.go:648] Add success.
I0320 01:16:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:16:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:16:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:16:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:16:53.409770 543705 memory.go:184] no items to output this cycle
I0320 01:16:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:17:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:03.409799 543705 memory.go:184] no items to output this cycle
I0320 01:17:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 01:17:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:13.409794 543705 memory.go:191] Add success.
I0320 01:17:13.409811 543705 cpu.go:282] Add success.
W0320 01:17:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:17:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:17:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:17:13.420121 543705 net.go:648] Add success.
I0320 01:17:13.422950 543705 net.go:770] primary dev: ETH0
I0320 01:17:13.422963 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:17:13.422975 543705 net.go:698] Add success.
I0320 01:17:13.453538 543705 event_worker.go:152] Polling the log file for events...
W0320 01:17:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:17:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 01:17:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:17:14.455889 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:17:14.455898 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:17:14.455903 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:17:14.456545 543705 disk_worker.go:494] system disk:vda1
I0320 01:17:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:17:15.456859 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:17:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:17:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:17:16.457976 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:17:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:17:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:17:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:17:21.861671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:17:21.864093 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:17:21.864099 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c500 0xc00034c540]
E0320 01:17:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:23.409787 543705 memory.go:184] no items to output this cycle
I0320 01:17:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:17:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:33.409767 543705 memory.go:184] no items to output this cycle
I0320 01:17:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:17:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:43.409795 543705 memory.go:191] Add success.
I0320 01:17:43.409797 543705 cpu.go:282] Add success.
I0320 01:17:43.419858 543705 net.go:648] Add success.
I0320 01:17:43.422536 543705 net.go:770] primary dev: ETH0
I0320 01:17:43.422549 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:17:43.422562 543705 net.go:698] Add success.
I0320 01:17:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:17:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:17:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:17:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:17:53.409770 543705 memory.go:184] no items to output this cycle
I0320 01:17:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:18:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:03.409809 543705 memory.go:184] no items to output this cycle
I0320 01:18:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 01:18:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:13.409880 543705 memory.go:191] Add success.
W0320 01:18:13.409911 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:18:13.409923 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:18:13.409928 543705 cpu.go:282] Add success.
I0320 01:18:13.409933 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:18:13.419748 543705 net.go:648] Add success.
I0320 01:18:13.422541 543705 net.go:770] primary dev: ETH0
I0320 01:18:13.422557 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:18:13.422570 543705 net.go:698] Add success.
I0320 01:18:13.908825 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"63ad3ce6-0fa2-4349-a7f4-fb9ce9574144","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:18:13.908867 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:18:14.454618 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:18:14.454848 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:18:14.454858 543705 disk_worker.go:708] disk space is not compliant
W0320 01:18:14.454861 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:18:14.456205 543705 disk_worker.go:494] system disk:vda1
I0320 01:18:14.456260 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:18:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:18:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:18:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:18:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:18:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:18:21.865671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:18:21.868076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:18:21.868082 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007af80 0xc00007afc0]
I0320 01:18:23.409788 543705 cpu.go:275] no items to output this cycle
E0320 01:18:23.409812 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:23.409829 543705 memory.go:184] no items to output this cycle
E0320 01:18:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 01:18:33.409804 543705 memory.go:184] no items to output this cycle
I0320 01:18:38.017748 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:18:38.017754 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:18:43.410410 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:43.411257 543705 memory.go:191] Add success.
I0320 01:18:43.410449 543705 cpu.go:282] Add success.
I0320 01:18:43.419944 543705 net.go:648] Add success.
I0320 01:18:43.422691 543705 net.go:770] primary dev: ETH0
I0320 01:18:43.422704 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:18:43.422716 543705 net.go:698] Add success.
I0320 01:18:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:18:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:18:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:18:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:18:53.409796 543705 memory.go:184] no items to output this cycle
I0320 01:18:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:19:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:03.409776 543705 memory.go:184] no items to output this cycle
I0320 01:19:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:19:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:13.409791 543705 memory.go:191] Add success.
I0320 01:19:13.409811 543705 cpu.go:282] Add success.
W0320 01:19:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:19:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:19:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:19:13.419718 543705 net.go:648] Add success.
I0320 01:19:13.422294 543705 net.go:770] primary dev: ETH0
I0320 01:19:13.422309 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:19:13.422323 543705 net.go:698] Add success.
I0320 01:19:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:19:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:19:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 01:19:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:19:14.456529 543705 disk_worker.go:494] system disk:vda1
I0320 01:19:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:19:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:19:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:19:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:19:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:19:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:19:21.869671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:19:21.872103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:19:21.872109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4300 0xc0000c4380]
E0320 01:19:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:23.409795 543705 memory.go:184] no items to output this cycle
I0320 01:19:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:19:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:33.409802 543705 memory.go:184] no items to output this cycle
I0320 01:19:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 01:19:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:43.409794 543705 memory.go:191] Add success.
I0320 01:19:43.409814 543705 cpu.go:282] Add success.
I0320 01:19:43.420009 543705 net.go:648] Add success.
I0320 01:19:43.423505 543705 net.go:770] primary dev: ETH0
I0320 01:19:43.423518 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:19:43.423530 543705 net.go:698] Add success.
I0320 01:19:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:19:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:19:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:19:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:19:53.409771 543705 memory.go:184] no items to output this cycle
I0320 01:19:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:20:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:03.409804 543705 memory.go:184] no items to output this cycle
I0320 01:20:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:20:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:13.409824 543705 memory.go:191] Add success.
I0320 01:20:13.409833 543705 cpu.go:282] Add success.
W0320 01:20:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:20:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:20:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:20:13.420414 543705 net.go:648] Add success.
I0320 01:20:13.423201 543705 net.go:770] primary dev: ETH0
I0320 01:20:13.423216 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:20:13.423239 543705 net.go:698] Add success.
I0320 01:20:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:20:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:20:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 01:20:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:20:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 01:20:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:20:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:20:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:20:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:20:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:20:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:20:21.873671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:20:21.876114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:20:21.876122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a9f40 0xc00007a000]
E0320 01:20:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:23.409790 543705 memory.go:184] no items to output this cycle
I0320 01:20:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 01:20:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:33.409783 543705 memory.go:184] no items to output this cycle
I0320 01:20:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:20:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:43.409818 543705 memory.go:191] Add success.
I0320 01:20:43.409825 543705 cpu.go:282] Add success.
I0320 01:20:43.419971 543705 net.go:648] Add success.
I0320 01:20:43.422490 543705 net.go:770] primary dev: ETH0
I0320 01:20:43.422505 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:20:43.422520 543705 net.go:698] Add success.
I0320 01:20:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:20:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:20:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:20:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:20:53.409767 543705 memory.go:184] no items to output this cycle
I0320 01:20:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:21:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:03.409769 543705 memory.go:184] no items to output this cycle
I0320 01:21:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:21:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:13.409821 543705 memory.go:191] Add success.
I0320 01:21:13.409822 543705 cpu.go:282] Add success.
W0320 01:21:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:21:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:21:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:21:13.420130 543705 net.go:648] Add success.
I0320 01:21:13.422912 543705 net.go:770] primary dev: ETH0
I0320 01:21:13.422925 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:21:13.422939 543705 net.go:698] Add success.
I0320 01:21:13.463016 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f06c1826-afbb-4184-ae47-4aaa4044fc63","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:21:13.463051 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:21:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:21:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:21:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 01:21:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:21:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 01:21:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:21:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:21:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:21:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:21:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:21:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:21:21.877673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:21:21.880088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:21:21.880094 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003aea00 0xc0003aea40]
E0320 01:21:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:23.409758 543705 memory.go:184] no items to output this cycle
I0320 01:21:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:21:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:33.409794 543705 memory.go:184] no items to output this cycle
I0320 01:21:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 01:21:38.020796 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:21:38.020803 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:21:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:43.410705 543705 memory.go:191] Add success.
I0320 01:21:43.409805 543705 cpu.go:282] Add success.
I0320 01:21:43.420432 543705 net.go:648] Add success.
I0320 01:21:43.423185 543705 net.go:770] primary dev: ETH0
I0320 01:21:43.423198 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:21:43.423212 543705 net.go:698] Add success.
I0320 01:21:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:21:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:21:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:21:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:21:53.409782 543705 memory.go:184] no items to output this cycle
I0320 01:21:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 01:22:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:03.409773 543705 memory.go:184] no items to output this cycle
I0320 01:22:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:22:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:13.409809 543705 memory.go:191] Add success.
I0320 01:22:13.409820 543705 cpu.go:282] Add success.
W0320 01:22:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:22:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:22:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:22:13.420073 543705 net.go:648] Add success.
I0320 01:22:13.423030 543705 net.go:770] primary dev: ETH0
I0320 01:22:13.423043 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:22:13.423058 543705 net.go:698] Add success.
W0320 01:22:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:22:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 01:22:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:22:14.456786 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:22:14.456795 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:22:14.456800 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:22:14.456844 543705 disk_worker.go:494] system disk:vda1
I0320 01:22:14.456883 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:22:15.456825 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:22:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:22:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:22:16.457975 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:22:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:22:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:22:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:22:21.881673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:22:21.884071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:22:21.884077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331540 0xc000331580]
E0320 01:22:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:23.409780 543705 memory.go:184] no items to output this cycle
I0320 01:22:23.409793 543705 cpu.go:275] no items to output this cycle
E0320 01:22:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:33.409799 543705 memory.go:184] no items to output this cycle
I0320 01:22:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:22:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:43.409785 543705 memory.go:191] Add success.
I0320 01:22:43.409806 543705 cpu.go:282] Add success.
I0320 01:22:43.419983 543705 net.go:648] Add success.
I0320 01:22:43.422641 543705 net.go:770] primary dev: ETH0
I0320 01:22:43.422653 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:22:43.422666 543705 net.go:698] Add success.
I0320 01:22:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:22:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:22:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:22:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:22:53.409770 543705 memory.go:184] no items to output this cycle
I0320 01:22:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:23:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:03.409768 543705 memory.go:184] no items to output this cycle
I0320 01:23:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:23:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:13.409819 543705 memory.go:191] Add success.
I0320 01:23:13.409829 543705 cpu.go:282] Add success.
W0320 01:23:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:23:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:23:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:23:13.420377 543705 net.go:648] Add success.
I0320 01:23:13.423464 543705 net.go:770] primary dev: ETH0
I0320 01:23:13.423480 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:23:13.423494 543705 net.go:698] Add success.
I0320 01:23:14.453952 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:23:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:23:14.455233 543705 disk_worker.go:708] disk space is not compliant
W0320 01:23:14.455236 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:23:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 01:23:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:23:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:23:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:23:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:23:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:23:16.472353 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:23:21.885673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:23:21.888040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:23:21.888046 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331ac0 0xc000331b00]
E0320 01:23:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:23.409780 543705 memory.go:184] no items to output this cycle
I0320 01:23:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:23:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:33.409784 543705 memory.go:184] no items to output this cycle
I0320 01:23:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:23:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:43.409816 543705 memory.go:191] Add success.
I0320 01:23:43.409827 543705 cpu.go:282] Add success.
I0320 01:23:43.419956 543705 net.go:648] Add success.
I0320 01:23:43.422669 543705 net.go:770] primary dev: ETH0
I0320 01:23:43.422685 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:23:43.422700 543705 net.go:698] Add success.
I0320 01:23:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:23:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:23:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:23:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:23:53.409785 543705 memory.go:184] no items to output this cycle
I0320 01:23:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:24:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:03.409785 543705 memory.go:184] no items to output this cycle
I0320 01:24:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 01:24:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:13.409792 543705 memory.go:191] Add success.
I0320 01:24:13.409812 543705 cpu.go:282] Add success.
W0320 01:24:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:24:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:24:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:24:13.420123 543705 net.go:648] Add success.
I0320 01:24:13.422944 543705 net.go:770] primary dev: ETH0
I0320 01:24:13.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:24:13.422971 543705 net.go:698] Add success.
I0320 01:24:13.464278 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ad5638c6-4afa-4cb0-8e39-ee12548561d4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:24:13.464311 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:24:14.454987 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:24:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:24:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 01:24:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:24:14.456680 543705 disk_worker.go:494] system disk:vda1
I0320 01:24:14.456710 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:24:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:24:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:24:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:24:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:24:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:24:21.889680 543705 disk_info.go:125] begin check local disk info of client
I0320 01:24:21.892121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:24:21.892128 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331a80 0xc000331ac0]
E0320 01:24:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:23.409784 543705 memory.go:184] no items to output this cycle
I0320 01:24:23.409786 543705 cpu.go:275] no items to output this cycle
E0320 01:24:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 01:24:33.409790 543705 memory.go:184] no items to output this cycle
I0320 01:24:38.021745 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:24:38.021752 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:24:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:43.410656 543705 memory.go:191] Add success.
I0320 01:24:43.409825 543705 cpu.go:282] Add success.
I0320 01:24:43.420367 543705 net.go:648] Add success.
I0320 01:24:43.422960 543705 net.go:770] primary dev: ETH0
I0320 01:24:43.422974 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:24:43.422986 543705 net.go:698] Add success.
I0320 01:24:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:24:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:24:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:24:53.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:24:53.409818 543705 memory.go:184] no items to output this cycle
I0320 01:24:53.409829 543705 cpu.go:275] no items to output this cycle
E0320 01:25:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:03.409788 543705 memory.go:184] no items to output this cycle
I0320 01:25:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 01:25:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:13.409805 543705 memory.go:191] Add success.
I0320 01:25:13.409811 543705 cpu.go:282] Add success.
W0320 01:25:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:25:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:25:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:25:13.420658 543705 net.go:648] Add success.
I0320 01:25:13.423579 543705 net.go:770] primary dev: ETH0
I0320 01:25:13.423591 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:25:13.423602 543705 net.go:698] Add success.
I0320 01:25:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:25:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:25:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 01:25:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:25:14.457636 543705 disk_worker.go:494] system disk:vda1
I0320 01:25:14.457693 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:25:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:25:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:25:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:25:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:25:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:25:21.893671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:25:21.896133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:25:21.896138 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c080 0xc00034c0c0]
E0320 01:25:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:23.409772 543705 memory.go:184] no items to output this cycle
I0320 01:25:23.409792 543705 cpu.go:275] no items to output this cycle
E0320 01:25:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:33.409783 543705 memory.go:184] no items to output this cycle
I0320 01:25:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:25:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:43.409815 543705 memory.go:191] Add success.
I0320 01:25:43.409821 543705 cpu.go:282] Add success.
I0320 01:25:43.419994 543705 net.go:648] Add success.
I0320 01:25:43.422895 543705 net.go:770] primary dev: ETH0
I0320 01:25:43.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:25:43.422924 543705 net.go:698] Add success.
I0320 01:25:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:25:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:25:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:25:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:25:53.409793 543705 cpu.go:275] no items to output this cycle
I0320 01:25:53.409798 543705 memory.go:184] no items to output this cycle
E0320 01:26:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:03.409777 543705 memory.go:184] no items to output this cycle
I0320 01:26:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:26:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:13.409821 543705 memory.go:191] Add success.
I0320 01:26:13.409826 543705 cpu.go:282] Add success.
W0320 01:26:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:26:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:26:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:26:13.420133 543705 net.go:648] Add success.
I0320 01:26:13.422743 543705 net.go:770] primary dev: ETH0
I0320 01:26:13.422758 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:26:13.422780 543705 net.go:698] Add success.
I0320 01:26:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:26:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:26:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 01:26:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:26:14.456836 543705 disk_worker.go:494] system disk:vda1
I0320 01:26:14.456865 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:26:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:26:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:26:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:26:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:26:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:26:21.897675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:26:21.900127 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:26:21.900135 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047fe80 0xc00047fec0]
E0320 01:26:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:23.409756 543705 memory.go:184] no items to output this cycle
I0320 01:26:23.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:26:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:33.409809 543705 memory.go:184] no items to output this cycle
I0320 01:26:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 01:26:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:43.409780 543705 memory.go:191] Add success.
I0320 01:26:43.409817 543705 cpu.go:282] Add success.
I0320 01:26:43.419854 543705 net.go:648] Add success.
I0320 01:26:43.422592 543705 net.go:770] primary dev: ETH0
I0320 01:26:43.422609 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:26:43.422621 543705 net.go:698] Add success.
I0320 01:26:46.458012 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:26:46.458086 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:26:46.458115 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:26:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:26:53.409781 543705 memory.go:184] no items to output this cycle
I0320 01:26:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 01:27:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:03.409781 543705 memory.go:184] no items to output this cycle
I0320 01:27:03.409789 543705 cpu.go:275] no items to output this cycle
W0320 01:27:13.409705 543705 conf_downlod.go:84] cant't download conf: Get "": unsupported protocol scheme ""
W0320 01:27:13.409718 543705 conf_downlod.go:89] use old conf
E0320 01:27:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:13.409796 543705 memory.go:191] Add success.
I0320 01:27:13.409819 543705 cpu.go:282] Add success.
W0320 01:27:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:27:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:27:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:27:13.420111 543705 net.go:648] Add success.
I0320 01:27:13.423116 543705 net.go:770] primary dev: ETH0
I0320 01:27:13.423130 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:27:13.423144 543705 net.go:698] Add success.
I0320 01:27:13.429802 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 01:27:13.453138 543705 event_worker.go:152] Polling the log file for events...
I0320 01:27:13.468491 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fe5c1129-a20c-46d1-8c07-df70a2b9f271","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:27:13.468526 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 01:27:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:27:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 01:27:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:27:14.456778 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:27:14.456787 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:27:14.456794 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:27:14.457790 543705 disk_worker.go:494] system disk:vda1
I0320 01:27:14.457847 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:27:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:27:15.456838 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:27:16.458067 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:27:16.458072 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:27:16.458123 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:27:16.458142 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:27:16.472544 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:27:21.901672 543705 disk_info.go:125] begin check local disk info of client
I0320 01:27:21.904066 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:27:21.904072 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002743c0 0xc000274400]
E0320 01:27:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:27:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:27:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:33.409785 543705 memory.go:184] no items to output this cycle
I0320 01:27:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 01:27:38.024762 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:27:38.024768 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:27:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:43.410674 543705 memory.go:191] Add success.
I0320 01:27:43.409785 543705 cpu.go:282] Add success.
I0320 01:27:43.420422 543705 net.go:648] Add success.
I0320 01:27:43.423078 543705 net.go:770] primary dev: ETH0
I0320 01:27:43.423092 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:27:43.423107 543705 net.go:698] Add success.
I0320 01:27:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:27:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:27:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:27:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:27:53.409785 543705 memory.go:184] no items to output this cycle
I0320 01:27:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 01:28:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:03.409767 543705 memory.go:184] no items to output this cycle
I0320 01:28:03.409799 543705 cpu.go:275] no items to output this cycle
W0320 01:28:13.409714 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:28:13.409737 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:28:13.409743 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:28:13.409828 543705 cpu.go:282] Add success.
E0320 01:28:13.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:13.409854 543705 memory.go:191] Add success.
I0320 01:28:13.419868 543705 net.go:770] primary dev: ETH0
I0320 01:28:13.419894 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:28:13.419910 543705 net.go:698] Add success.
I0320 01:28:13.420286 543705 net.go:648] Add success.
I0320 01:28:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:28:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:28:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 01:28:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:28:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 01:28:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:28:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:28:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:28:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:28:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:28:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:28:21.905672 543705 disk_info.go:125] begin check local disk info of client
I0320 01:28:21.908149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:28:21.908155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a2b40 0xc0004a2b80]
E0320 01:28:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 01:28:23.409783 543705 memory.go:184] no items to output this cycle
E0320 01:28:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:33.409785 543705 memory.go:184] no items to output this cycle
I0320 01:28:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 01:28:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:43.409791 543705 memory.go:191] Add success.
I0320 01:28:43.409791 543705 cpu.go:282] Add success.
I0320 01:28:43.420005 543705 net.go:648] Add success.
I0320 01:28:43.423028 543705 net.go:770] primary dev: ETH0
I0320 01:28:43.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:28:43.423056 543705 net.go:698] Add success.
I0320 01:28:46.458005 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:28:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:28:46.458104 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:28:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:28:53.409793 543705 memory.go:184] no items to output this cycle
I0320 01:28:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:29:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:03.409778 543705 memory.go:184] no items to output this cycle
I0320 01:29:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 01:29:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:13.409796 543705 memory.go:191] Add success.
I0320 01:29:13.409797 543705 cpu.go:282] Add success.
W0320 01:29:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:29:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:29:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:29:13.420096 543705 net.go:648] Add success.
I0320 01:29:13.422687 543705 net.go:770] primary dev: ETH0
I0320 01:29:13.422700 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:29:13.422716 543705 net.go:698] Add success.
I0320 01:29:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:29:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:29:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 01:29:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:29:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 01:29:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:29:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:29:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:29:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:29:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:29:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:29:21.909676 543705 disk_info.go:125] begin check local disk info of client
I0320 01:29:21.912117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:29:21.912124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004fc080 0xc0004fc0c0]
E0320 01:29:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:23.409785 543705 memory.go:184] no items to output this cycle
I0320 01:29:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:29:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:33.409803 543705 memory.go:184] no items to output this cycle
I0320 01:29:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:43.409809 543705 memory.go:191] Add success.
I0320 01:29:43.409820 543705 cpu.go:282] Add success.
I0320 01:29:43.419879 543705 net.go:648] Add success.
I0320 01:29:43.422561 543705 net.go:770] primary dev: ETH0
I0320 01:29:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:29:43.422590 543705 net.go:698] Add success.
I0320 01:29:46.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:29:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:29:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:29:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:29:53.409782 543705 memory.go:184] no items to output this cycle
I0320 01:29:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 01:30:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:03.409802 543705 memory.go:184] no items to output this cycle
I0320 01:30:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 01:30:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:13.409821 543705 memory.go:191] Add success.
I0320 01:30:13.409827 543705 cpu.go:282] Add success.
W0320 01:30:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:30:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:30:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:30:13.420191 543705 net.go:648] Add success.
I0320 01:30:13.422939 543705 net.go:770] primary dev: ETH0
I0320 01:30:13.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:30:13.422963 543705 net.go:698] Add success.
I0320 01:30:13.470288 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6c9906e3-787b-472c-88e4-db5ab018bc19","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:30:13.470322 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:30:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:30:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:30:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 01:30:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:30:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 01:30:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:30:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:30:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:30:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:30:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:30:16.472094 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:30:21.913681 543705 disk_info.go:125] begin check local disk info of client
I0320 01:30:21.916317 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:30:21.916325 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c000 0xc00034c040]
E0320 01:30:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:23.409767 543705 memory.go:184] no items to output this cycle
I0320 01:30:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:30:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:33.409789 543705 memory.go:184] no items to output this cycle
I0320 01:30:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 01:30:38.025739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:30:38.025745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:30:43.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:43.410848 543705 memory.go:191] Add success.
I0320 01:30:43.409807 543705 cpu.go:282] Add success.
I0320 01:30:43.420592 543705 net.go:648] Add success.
I0320 01:30:43.423622 543705 net.go:770] primary dev: ETH0
I0320 01:30:43.423637 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:30:43.423651 543705 net.go:698] Add success.
I0320 01:30:46.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:30:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:30:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:30:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:30:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:30:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:31:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:03.409777 543705 memory.go:184] no items to output this cycle
I0320 01:31:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 01:31:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:13.409786 543705 memory.go:191] Add success.
W0320 01:31:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:31:13.409819 543705 cpu.go:282] Add success.
W0320 01:31:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:31:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:31:13.420283 543705 net.go:648] Add success.
I0320 01:31:13.422929 543705 net.go:770] primary dev: ETH0
I0320 01:31:13.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:31:13.422955 543705 net.go:698] Add success.
I0320 01:31:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:31:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:31:14.455234 543705 disk_worker.go:708] disk space is not compliant
W0320 01:31:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:31:14.456864 543705 disk_worker.go:494] system disk:vda1
I0320 01:31:14.456909 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:31:15.455985 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:31:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:31:16.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:31:16.458109 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:31:16.472512 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:31:21.917666 543705 disk_info.go:125] begin check local disk info of client
I0320 01:31:21.920173 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:31:21.920179 543705 disk_info.go:196] parse disk info done, disk is : [0xc000381040 0xc000381080]
E0320 01:31:23.410252 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:23.410271 543705 memory.go:184] no items to output this cycle
I0320 01:31:23.410284 543705 cpu.go:275] no items to output this cycle
E0320 01:31:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:33.409784 543705 memory.go:184] no items to output this cycle
I0320 01:31:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 01:31:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:43.409797 543705 memory.go:191] Add success.
I0320 01:31:43.409800 543705 cpu.go:282] Add success.
I0320 01:31:43.419887 543705 net.go:648] Add success.
I0320 01:31:43.422679 543705 net.go:770] primary dev: ETH0
I0320 01:31:43.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:31:43.422708 543705 net.go:698] Add success.
I0320 01:31:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:31:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:31:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:31:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:31:53.409815 543705 memory.go:184] no items to output this cycle
I0320 01:31:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 01:32:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:03.409797 543705 memory.go:184] no items to output this cycle
I0320 01:32:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:32:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:13.409820 543705 memory.go:191] Add success.
I0320 01:32:13.409824 543705 cpu.go:282] Add success.
W0320 01:32:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:32:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:32:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:32:13.420204 543705 net.go:648] Add success.
I0320 01:32:13.423328 543705 net.go:770] primary dev: ETH0
I0320 01:32:13.423343 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:32:13.423358 543705 net.go:698] Add success.
W0320 01:32:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:32:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 01:32:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:32:14.455827 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:32:14.455837 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:32:14.455843 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:32:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 01:32:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:32:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:32:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:32:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:32:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:32:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:32:16.458013 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:32:16.472353 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:32:21.921674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:32:21.924037 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:32:21.924043 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386ac0 0xc000386b00]
E0320 01:32:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:23.409804 543705 memory.go:184] no items to output this cycle
I0320 01:32:23.409815 543705 cpu.go:275] no items to output this cycle
E0320 01:32:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:33.409896 543705 cpu.go:275] no items to output this cycle
I0320 01:32:33.409933 543705 memory.go:184] no items to output this cycle
E0320 01:32:43.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:43.409771 543705 memory.go:191] Add success.
I0320 01:32:43.409804 543705 cpu.go:282] Add success.
I0320 01:32:43.419977 543705 net.go:648] Add success.
I0320 01:32:43.422865 543705 net.go:770] primary dev: ETH0
I0320 01:32:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:32:43.422890 543705 net.go:698] Add success.
I0320 01:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:32:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:32:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:32:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:32:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:32:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 01:33:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:03.409800 543705 memory.go:184] no items to output this cycle
I0320 01:33:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:33:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:13.409819 543705 memory.go:191] Add success.
I0320 01:33:13.409828 543705 cpu.go:282] Add success.
W0320 01:33:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:33:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:33:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:33:13.420297 543705 net.go:648] Add success.
I0320 01:33:13.423313 543705 net.go:770] primary dev: ETH0
I0320 01:33:13.423328 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:33:13.423341 543705 net.go:698] Add success.
I0320 01:33:13.598829 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b26ad963-0df9-416f-bc3a-9aca7815445d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:33:13.598863 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:33:14.455096 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:33:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:33:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 01:33:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:33:14.456780 543705 disk_worker.go:494] system disk:vda1
I0320 01:33:14.456822 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:33:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:33:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:33:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:33:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:33:16.472478 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:33:21.925673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:33:21.928117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:33:21.928125 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e9f40 0xc0001f4000]
E0320 01:33:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:23.409761 543705 memory.go:184] no items to output this cycle
I0320 01:33:23.409782 543705 cpu.go:275] no items to output this cycle
E0320 01:33:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:33.409785 543705 memory.go:184] no items to output this cycle
I0320 01:33:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 01:33:38.025892 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:33:38.025898 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:33:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:43.410731 543705 memory.go:191] Add success.
I0320 01:33:43.409795 543705 cpu.go:282] Add success.
I0320 01:33:43.420424 543705 net.go:648] Add success.
I0320 01:33:43.423294 543705 net.go:770] primary dev: ETH0
I0320 01:33:43.423307 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:33:43.423319 543705 net.go:698] Add success.
I0320 01:33:46.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:33:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:33:46.458098 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:33:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:33:53.409785 543705 memory.go:184] no items to output this cycle
I0320 01:33:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 01:34:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:03.409798 543705 memory.go:184] no items to output this cycle
I0320 01:34:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:34:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:13.409781 543705 memory.go:191] Add success.
I0320 01:34:13.409804 543705 cpu.go:282] Add success.
W0320 01:34:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:34:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:34:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:34:13.420255 543705 net.go:648] Add success.
I0320 01:34:13.423279 543705 net.go:770] primary dev: ETH0
I0320 01:34:13.423294 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:34:13.423308 543705 net.go:698] Add success.
I0320 01:34:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:34:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:34:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 01:34:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:34:14.456515 543705 disk_worker.go:494] system disk:vda1
I0320 01:34:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:34:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:34:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:34:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:34:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:34:21.929675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:34:21.932113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:34:21.932119 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e700 0xc00039e740]
E0320 01:34:23.410209 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:23.410227 543705 memory.go:184] no items to output this cycle
I0320 01:34:23.410255 543705 cpu.go:275] no items to output this cycle
E0320 01:34:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:33.409776 543705 memory.go:184] no items to output this cycle
I0320 01:34:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:34:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:43.409805 543705 memory.go:191] Add success.
I0320 01:34:43.409809 543705 cpu.go:282] Add success.
I0320 01:34:43.419982 543705 net.go:648] Add success.
I0320 01:34:43.422698 543705 net.go:770] primary dev: ETH0
I0320 01:34:43.422711 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:34:43.422722 543705 net.go:698] Add success.
I0320 01:34:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:34:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:34:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:34:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:34:53.409812 543705 memory.go:184] no items to output this cycle
I0320 01:34:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 01:35:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:03.409778 543705 memory.go:184] no items to output this cycle
I0320 01:35:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 01:35:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:13.409826 543705 memory.go:191] Add success.
I0320 01:35:13.409827 543705 cpu.go:282] Add success.
W0320 01:35:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:35:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:35:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:35:13.420184 543705 net.go:648] Add success.
I0320 01:35:13.422941 543705 net.go:770] primary dev: ETH0
I0320 01:35:13.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:35:13.422972 543705 net.go:698] Add success.
I0320 01:35:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:35:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:35:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 01:35:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:35:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 01:35:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:35:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:35:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:35:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:35:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:35:16.472356 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:35:21.933675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:35:21.936073 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:35:21.936079 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034df00 0xc00034df40]
E0320 01:35:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 01:35:23.409794 543705 memory.go:184] no items to output this cycle
E0320 01:35:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:33.409779 543705 memory.go:184] no items to output this cycle
I0320 01:35:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 01:35:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:43.409780 543705 memory.go:191] Add success.
I0320 01:35:43.409798 543705 cpu.go:282] Add success.
I0320 01:35:43.419862 543705 net.go:648] Add success.
I0320 01:35:43.423111 543705 net.go:770] primary dev: ETH0
I0320 01:35:43.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:35:43.423136 543705 net.go:698] Add success.
I0320 01:35:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:35:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:35:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:35:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:35:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:35:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:36:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:03.409782 543705 memory.go:184] no items to output this cycle
I0320 01:36:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 01:36:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:13.409791 543705 cpu.go:282] Add success.
I0320 01:36:13.409797 543705 memory.go:191] Add success.
W0320 01:36:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:36:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:36:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:36:13.420140 543705 net.go:648] Add success.
I0320 01:36:13.423199 543705 net.go:770] primary dev: ETH0
I0320 01:36:13.423212 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:36:13.423224 543705 net.go:698] Add success.
I0320 01:36:13.462438 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"342fdcc9-4a16-470b-9e0f-85a0a0d2e2da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:36:13.462469 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:36:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:36:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:36:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 01:36:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:36:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 01:36:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:36:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:36:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:36:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:36:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:36:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:36:21.937671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:36:21.940086 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:36:21.940092 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003291c0 0xc000329200]
E0320 01:36:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:23.409766 543705 memory.go:184] no items to output this cycle
I0320 01:36:23.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:36:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:33.409786 543705 memory.go:184] no items to output this cycle
I0320 01:36:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 01:36:38.026035 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:36:38.026041 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:36:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:43.410731 543705 memory.go:191] Add success.
I0320 01:36:43.409820 543705 cpu.go:282] Add success.
I0320 01:36:43.420485 543705 net.go:648] Add success.
I0320 01:36:43.423196 543705 net.go:770] primary dev: ETH0
I0320 01:36:43.423209 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:36:43.423221 543705 net.go:698] Add success.
I0320 01:36:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:36:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:36:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:36:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:36:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 01:37:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:03.409784 543705 memory.go:184] no items to output this cycle
I0320 01:37:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:37:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:13.409803 543705 memory.go:191] Add success.
I0320 01:37:13.409806 543705 cpu.go:282] Add success.
W0320 01:37:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:37:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:37:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:37:13.420096 543705 net.go:648] Add success.
I0320 01:37:13.422945 543705 net.go:770] primary dev: ETH0
I0320 01:37:13.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:37:13.422970 543705 net.go:698] Add success.
I0320 01:37:13.453578 543705 event_worker.go:152] Polling the log file for events...
W0320 01:37:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:37:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 01:37:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:37:14.455927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:37:14.455935 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:37:14.455941 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:37:14.456557 543705 disk_worker.go:494] system disk:vda1
I0320 01:37:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:37:15.456785 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:37:15.456794 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:37:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:37:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:37:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:37:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:37:16.472343 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:37:21.941680 543705 disk_info.go:125] begin check local disk info of client
I0320 01:37:21.944100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:37:21.944107 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007afc0 0xc00007b000]
E0320 01:37:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:37:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:37:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:33.409794 543705 memory.go:184] no items to output this cycle
I0320 01:37:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:37:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:43.409776 543705 memory.go:191] Add success.
I0320 01:37:43.409797 543705 cpu.go:282] Add success.
I0320 01:37:43.419937 543705 net.go:648] Add success.
I0320 01:37:43.422662 543705 net.go:770] primary dev: ETH0
I0320 01:37:43.422674 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:37:43.422686 543705 net.go:698] Add success.
I0320 01:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:37:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:37:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:37:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:37:53.409805 543705 memory.go:184] no items to output this cycle
I0320 01:37:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 01:38:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:03.409775 543705 memory.go:184] no items to output this cycle
I0320 01:38:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 01:38:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:13.409800 543705 memory.go:191] Add success.
I0320 01:38:13.409821 543705 cpu.go:282] Add success.
W0320 01:38:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:38:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:38:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:38:13.420169 543705 net.go:648] Add success.
I0320 01:38:13.423046 543705 net.go:770] primary dev: ETH0
I0320 01:38:13.423059 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:38:13.423072 543705 net.go:698] Add success.
I0320 01:38:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:38:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:38:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 01:38:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:38:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 01:38:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:38:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:38:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:38:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:38:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:38:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:38:21.945676 543705 disk_info.go:125] begin check local disk info of client
I0320 01:38:21.948137 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:38:21.948143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5040 0xc0000c5080]
E0320 01:38:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:23.409792 543705 memory.go:184] no items to output this cycle
I0320 01:38:23.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:38:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:33.409800 543705 memory.go:184] no items to output this cycle
I0320 01:38:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:38:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:43.409815 543705 memory.go:191] Add success.
I0320 01:38:43.409823 543705 cpu.go:282] Add success.
I0320 01:38:43.419873 543705 net.go:648] Add success.
I0320 01:38:43.422514 543705 net.go:770] primary dev: ETH0
I0320 01:38:43.422526 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:38:43.422537 543705 net.go:698] Add success.
I0320 01:38:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:38:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:38:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:38:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:38:53.409785 543705 memory.go:184] no items to output this cycle
I0320 01:38:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:39:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:03.409798 543705 memory.go:184] no items to output this cycle
I0320 01:39:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 01:39:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:13.409890 543705 memory.go:191] Add success.
W0320 01:39:13.409921 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:39:13.409933 543705 cpu.go:282] Add success.
W0320 01:39:13.409934 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:39:13.409938 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:39:13.419754 543705 net.go:648] Add success.
I0320 01:39:13.422893 543705 net.go:770] primary dev: ETH0
I0320 01:39:13.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:39:13.422921 543705 net.go:698] Add success.
I0320 01:39:13.468691 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f8de55dc-a447-4848-9c9e-37f517a9152b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:39:13.468724 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:39:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:39:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:39:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 01:39:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:39:14.456622 543705 disk_worker.go:494] system disk:vda1
I0320 01:39:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:39:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:39:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:39:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:39:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:39:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:39:21.949677 543705 disk_info.go:125] begin check local disk info of client
I0320 01:39:21.952163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:39:21.952170 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034de80 0xc00034dec0]
E0320 01:39:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 01:39:23.409785 543705 memory.go:184] no items to output this cycle
E0320 01:39:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:33.409775 543705 cpu.go:275] no items to output this cycle
I0320 01:39:33.409790 543705 memory.go:184] no items to output this cycle
I0320 01:39:38.028832 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:39:38.028838 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:39:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:43.410674 543705 memory.go:191] Add success.
I0320 01:39:43.409823 543705 cpu.go:282] Add success.
I0320 01:39:43.420399 543705 net.go:648] Add success.
I0320 01:39:43.423186 543705 net.go:770] primary dev: ETH0
I0320 01:39:43.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:39:43.423215 543705 net.go:698] Add success.
I0320 01:39:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:39:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:39:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:39:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:39:53.409776 543705 memory.go:184] no items to output this cycle
I0320 01:39:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 01:40:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:03.409774 543705 memory.go:184] no items to output this cycle
I0320 01:40:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 01:40:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:13.409810 543705 memory.go:191] Add success.
I0320 01:40:13.409817 543705 cpu.go:282] Add success.
W0320 01:40:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:40:13.413258 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:40:13.413263 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:40:13.419773 543705 net.go:648] Add success.
I0320 01:40:13.421437 543705 net.go:770] primary dev: ETH0
I0320 01:40:13.421452 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:40:13.421465 543705 net.go:698] Add success.
I0320 01:40:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:40:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:40:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 01:40:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:40:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 01:40:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:40:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:40:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:40:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:40:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:40:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:40:21.953674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:40:21.956122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:40:21.956129 543705 disk_info.go:196] parse disk info done, disk is : [0xc000472400 0xc000472440]
E0320 01:40:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:23.409757 543705 memory.go:184] no items to output this cycle
I0320 01:40:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 01:40:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:33.409799 543705 memory.go:184] no items to output this cycle
I0320 01:40:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 01:40:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:43.409774 543705 memory.go:191] Add success.
I0320 01:40:43.409808 543705 cpu.go:282] Add success.
I0320 01:40:43.419834 543705 net.go:648] Add success.
I0320 01:40:43.422871 543705 net.go:770] primary dev: ETH0
I0320 01:40:43.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:40:43.422897 543705 net.go:698] Add success.
I0320 01:40:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:40:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:40:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:40:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:40:53.409784 543705 cpu.go:275] no items to output this cycle
I0320 01:40:53.409789 543705 memory.go:184] no items to output this cycle
E0320 01:41:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:03.409803 543705 memory.go:184] no items to output this cycle
I0320 01:41:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 01:41:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:13.409785 543705 memory.go:191] Add success.
W0320 01:41:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:41:13.409816 543705 cpu.go:282] Add success.
W0320 01:41:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:41:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:41:13.420311 543705 net.go:648] Add success.
I0320 01:41:13.423057 543705 net.go:770] primary dev: ETH0
I0320 01:41:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:41:13.423082 543705 net.go:698] Add success.
I0320 01:41:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:41:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:41:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 01:41:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:41:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 01:41:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:41:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:41:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:41:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:41:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:41:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:41:21.957674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:41:21.960211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:41:21.960217 543705 disk_info.go:196] parse disk info done, disk is : [0xc000467c00 0xc000467c40]
E0320 01:41:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:23.409779 543705 memory.go:184] no items to output this cycle
I0320 01:41:23.409782 543705 cpu.go:275] no items to output this cycle
E0320 01:41:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:33.409798 543705 memory.go:184] no items to output this cycle
I0320 01:41:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:41:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:43.409817 543705 memory.go:191] Add success.
I0320 01:41:43.409831 543705 cpu.go:282] Add success.
I0320 01:41:43.420022 543705 net.go:648] Add success.
I0320 01:41:43.422807 543705 net.go:770] primary dev: ETH0
I0320 01:41:43.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:41:43.422837 543705 net.go:698] Add success.
I0320 01:41:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:41:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:41:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:41:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:41:53.409782 543705 memory.go:184] no items to output this cycle
I0320 01:41:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:42:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:03.409773 543705 memory.go:184] no items to output this cycle
I0320 01:42:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:42:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:13.409805 543705 memory.go:191] Add success.
I0320 01:42:13.409812 543705 cpu.go:282] Add success.
W0320 01:42:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:42:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:42:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:42:13.420038 543705 net.go:648] Add success.
I0320 01:42:13.422756 543705 net.go:770] primary dev: ETH0
I0320 01:42:13.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:42:13.422781 543705 net.go:698] Add success.
I0320 01:42:13.470099 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8ae30a92-0dbb-4c7f-b005-77f27d8921a7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:42:13.470132 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 01:42:14.455710 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:42:14.455726 543705 disk_worker.go:708] disk space is not compliant
W0320 01:42:14.455730 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:42:14.456282 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:42:14.456291 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:42:14.456298 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:42:14.457627 543705 disk_worker.go:494] system disk:vda1
I0320 01:42:14.457675 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:42:15.456847 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:42:15.456856 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:42:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:42:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:42:16.457989 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:42:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:42:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:42:21.961673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:42:21.964164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:42:21.964171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd280 0xc0002bd2c0]
E0320 01:42:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:23.409792 543705 memory.go:184] no items to output this cycle
I0320 01:42:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 01:42:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 01:42:33.409812 543705 memory.go:184] no items to output this cycle
I0320 01:42:38.029727 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:42:38.029733 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:42:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:43.410637 543705 memory.go:191] Add success.
I0320 01:42:43.409782 543705 cpu.go:282] Add success.
I0320 01:42:43.420345 543705 net.go:648] Add success.
I0320 01:42:43.422970 543705 net.go:770] primary dev: ETH0
I0320 01:42:43.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:42:43.423000 543705 net.go:698] Add success.
I0320 01:42:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:42:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:42:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:42:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:42:53.409775 543705 memory.go:184] no items to output this cycle
I0320 01:42:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 01:43:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:03.409782 543705 memory.go:184] no items to output this cycle
I0320 01:43:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 01:43:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:13.409791 543705 memory.go:191] Add success.
W0320 01:43:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:43:13.409819 543705 cpu.go:282] Add success.
W0320 01:43:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:43:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:43:13.420218 543705 net.go:648] Add success.
I0320 01:43:13.422919 543705 net.go:770] primary dev: ETH0
I0320 01:43:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:43:13.422949 543705 net.go:698] Add success.
I0320 01:43:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:43:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:43:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 01:43:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:43:14.457768 543705 disk_worker.go:494] system disk:vda1
I0320 01:43:14.457804 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:43:15.455946 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:43:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:43:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:43:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:43:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:43:21.965676 543705 disk_info.go:125] begin check local disk info of client
I0320 01:43:21.968608 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:43:21.968614 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394940 0xc000394980]
E0320 01:43:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 01:43:23.409782 543705 memory.go:184] no items to output this cycle
E0320 01:43:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:33.409784 543705 memory.go:184] no items to output this cycle
I0320 01:43:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:43:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:43.409815 543705 memory.go:191] Add success.
I0320 01:43:43.409821 543705 cpu.go:282] Add success.
I0320 01:43:43.419883 543705 net.go:648] Add success.
I0320 01:43:43.422830 543705 net.go:770] primary dev: ETH0
I0320 01:43:43.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:43:43.422855 543705 net.go:698] Add success.
I0320 01:43:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:43:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:43:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:43:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:43:53.409787 543705 memory.go:184] no items to output this cycle
I0320 01:43:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:44:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:03.409799 543705 memory.go:184] no items to output this cycle
I0320 01:44:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 01:44:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:13.409815 543705 memory.go:191] Add success.
I0320 01:44:13.409815 543705 cpu.go:282] Add success.
W0320 01:44:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:44:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:44:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:44:13.420127 543705 net.go:648] Add success.
I0320 01:44:13.422851 543705 net.go:770] primary dev: ETH0
I0320 01:44:13.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:44:13.422880 543705 net.go:698] Add success.
I0320 01:44:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:44:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:44:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 01:44:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:44:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 01:44:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:44:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:44:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:44:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:44:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:44:16.472439 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:44:21.969673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:44:21.972197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:44:21.972204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005702c0 0xc000570300]
E0320 01:44:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:23.409773 543705 memory.go:184] no items to output this cycle
I0320 01:44:23.409778 543705 cpu.go:275] no items to output this cycle
E0320 01:44:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:33.409800 543705 memory.go:184] no items to output this cycle
I0320 01:44:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:44:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:43.409779 543705 memory.go:191] Add success.
I0320 01:44:43.409799 543705 cpu.go:282] Add success.
I0320 01:44:43.419881 543705 net.go:648] Add success.
I0320 01:44:43.422766 543705 net.go:770] primary dev: ETH0
I0320 01:44:43.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:44:43.422793 543705 net.go:698] Add success.
I0320 01:44:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:44:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:44:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:44:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:44:53.409777 543705 memory.go:184] no items to output this cycle
I0320 01:44:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:45:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:03.409774 543705 memory.go:184] no items to output this cycle
I0320 01:45:03.409777 543705 cpu.go:275] no items to output this cycle
E0320 01:45:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:13.409821 543705 memory.go:191] Add success.
I0320 01:45:13.409824 543705 cpu.go:282] Add success.
W0320 01:45:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:45:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:45:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:45:13.420149 543705 net.go:648] Add success.
I0320 01:45:13.423118 543705 net.go:770] primary dev: ETH0
I0320 01:45:13.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:45:13.423147 543705 net.go:698] Add success.
I0320 01:45:14.069134 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"af7acb95-914d-4141-86cd-69d9ea6c1b1c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:45:14.069184 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:45:14.454693 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:45:14.454849 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:45:14.454935 543705 disk_worker.go:708] disk space is not compliant
W0320 01:45:14.454939 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:45:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 01:45:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:45:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:45:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:45:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:45:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:45:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:45:21.973668 543705 disk_info.go:125] begin check local disk info of client
I0320 01:45:21.976266 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:45:21.976274 543705 disk_info.go:196] parse disk info done, disk is : [0xc000353140 0xc000353180]
E0320 01:45:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:23.409768 543705 memory.go:184] no items to output this cycle
I0320 01:45:23.409922 543705 cpu.go:275] no items to output this cycle
E0320 01:45:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:33.409777 543705 memory.go:184] no items to output this cycle
I0320 01:45:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 01:45:38.029871 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:45:38.029877 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:45:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:43.410744 543705 memory.go:191] Add success.
I0320 01:45:43.409808 543705 cpu.go:282] Add success.
I0320 01:45:43.420468 543705 net.go:648] Add success.
I0320 01:45:43.423212 543705 net.go:770] primary dev: ETH0
I0320 01:45:43.423227 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:45:43.423241 543705 net.go:698] Add success.
I0320 01:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:45:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:45:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:45:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:45:53.409816 543705 memory.go:184] no items to output this cycle
I0320 01:45:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 01:46:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:03.409787 543705 memory.go:184] no items to output this cycle
I0320 01:46:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 01:46:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:13.409796 543705 cpu.go:282] Add success.
I0320 01:46:13.409798 543705 memory.go:191] Add success.
W0320 01:46:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:46:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:46:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:46:13.420065 543705 net.go:648] Add success.
I0320 01:46:13.422753 543705 net.go:770] primary dev: ETH0
I0320 01:46:13.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:46:13.422780 543705 net.go:698] Add success.
I0320 01:46:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:46:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:46:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 01:46:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:46:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 01:46:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:46:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:46:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:46:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:46:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:46:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:46:21.977673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:46:21.980082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:46:21.980089 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029ab80 0xc00029abc0]
E0320 01:46:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:23.409772 543705 memory.go:184] no items to output this cycle
I0320 01:46:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 01:46:33.409829 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:33.409868 543705 memory.go:184] no items to output this cycle
I0320 01:46:33.409997 543705 cpu.go:275] no items to output this cycle
E0320 01:46:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:43.409790 543705 memory.go:191] Add success.
I0320 01:46:43.409806 543705 cpu.go:282] Add success.
I0320 01:46:43.419880 543705 net.go:648] Add success.
I0320 01:46:43.422536 543705 net.go:770] primary dev: ETH0
I0320 01:46:43.422551 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:46:43.422565 543705 net.go:698] Add success.
I0320 01:46:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:46:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:46:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:46:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:46:53.409806 543705 memory.go:184] no items to output this cycle
I0320 01:46:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 01:47:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:03.409787 543705 memory.go:184] no items to output this cycle
I0320 01:47:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 01:47:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:13.409806 543705 memory.go:191] Add success.
I0320 01:47:13.409810 543705 cpu.go:282] Add success.
W0320 01:47:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:47:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:47:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:47:13.420056 543705 net.go:648] Add success.
I0320 01:47:13.423144 543705 net.go:770] primary dev: ETH0
I0320 01:47:13.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:47:13.423177 543705 net.go:698] Add success.
I0320 01:47:13.453744 543705 event_worker.go:152] Polling the log file for events...
W0320 01:47:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:47:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 01:47:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:47:14.456925 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:47:14.456934 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:47:14.456940 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:47:14.457010 543705 disk_worker.go:494] system disk:vda1
I0320 01:47:14.457055 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:47:15.456823 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:47:15.456833 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:47:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:47:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:47:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:47:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:47:16.472318 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:47:21.981681 543705 disk_info.go:125] begin check local disk info of client
I0320 01:47:21.984083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:47:21.984089 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270340 0xc000270380]
E0320 01:47:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:23.409775 543705 memory.go:184] no items to output this cycle
I0320 01:47:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 01:47:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:33.409786 543705 memory.go:184] no items to output this cycle
I0320 01:47:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:47:43.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:43.409903 543705 memory.go:191] Add success.
I0320 01:47:43.409983 543705 cpu.go:282] Add success.
I0320 01:47:43.419725 543705 net.go:648] Add success.
I0320 01:47:43.422306 543705 net.go:770] primary dev: ETH0
I0320 01:47:43.422320 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:47:43.422334 543705 net.go:698] Add success.
I0320 01:47:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:47:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:47:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:47:53.409816 543705 memory.go:184] no items to output this cycle
I0320 01:47:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 01:48:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:03.409775 543705 memory.go:184] no items to output this cycle
I0320 01:48:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:48:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:13.409783 543705 memory.go:191] Add success.
I0320 01:48:13.409804 543705 cpu.go:282] Add success.
W0320 01:48:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:48:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:48:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:48:13.420265 543705 net.go:648] Add success.
I0320 01:48:13.423189 543705 net.go:770] primary dev: ETH0
I0320 01:48:13.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:48:13.423213 543705 net.go:698] Add success.
I0320 01:48:13.585986 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d788d7ad-828e-4204-92de-6461dd5d7ccb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:48:13.586019 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:48:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:48:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:48:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 01:48:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:48:14.456678 543705 disk_worker.go:494] system disk:vda1
I0320 01:48:14.456710 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:48:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:48:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:48:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:48:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:48:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:48:21.985676 543705 disk_info.go:125] begin check local disk info of client
I0320 01:48:21.988083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:48:21.988090 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270dc0 0xc000270e00]
E0320 01:48:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:23.409787 543705 memory.go:184] no items to output this cycle
I0320 01:48:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 01:48:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:33.409780 543705 memory.go:184] no items to output this cycle
I0320 01:48:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 01:48:38.030020 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:48:38.030027 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:48:43.409870 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:43.410836 543705 memory.go:191] Add success.
I0320 01:48:43.410075 543705 cpu.go:282] Add success.
I0320 01:48:43.419754 543705 net.go:648] Add success.
I0320 01:48:43.422555 543705 net.go:770] primary dev: ETH0
I0320 01:48:43.422568 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:48:43.422580 543705 net.go:698] Add success.
I0320 01:48:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:48:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:48:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:48:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:48:53.409805 543705 memory.go:184] no items to output this cycle
I0320 01:48:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 01:49:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:03.409806 543705 memory.go:184] no items to output this cycle
I0320 01:49:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 01:49:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:13.409798 543705 memory.go:191] Add success.
I0320 01:49:13.409801 543705 cpu.go:282] Add success.
W0320 01:49:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:49:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:49:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:49:13.420111 543705 net.go:648] Add success.
I0320 01:49:13.422902 543705 net.go:770] primary dev: ETH0
I0320 01:49:13.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:49:13.422928 543705 net.go:698] Add success.
I0320 01:49:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:49:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:49:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 01:49:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:49:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 01:49:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:49:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:49:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:49:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:49:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:49:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:49:21.989674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:49:21.992084 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:49:21.992090 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034d100 0xc00034d140]
E0320 01:49:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:23.409778 543705 memory.go:184] no items to output this cycle
I0320 01:49:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 01:49:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:33.409784 543705 memory.go:184] no items to output this cycle
I0320 01:49:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 01:49:43.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:43.409903 543705 cpu.go:282] Add success.
I0320 01:49:43.409905 543705 memory.go:191] Add success.
I0320 01:49:43.419745 543705 net.go:648] Add success.
I0320 01:49:43.422390 543705 net.go:770] primary dev: ETH0
I0320 01:49:43.422403 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:49:43.422414 543705 net.go:698] Add success.
I0320 01:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:49:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:49:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:49:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:49:53.409795 543705 memory.go:184] no items to output this cycle
I0320 01:49:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:50:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:03.409778 543705 memory.go:184] no items to output this cycle
I0320 01:50:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 01:50:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:13.409827 543705 memory.go:191] Add success.
I0320 01:50:13.409832 543705 cpu.go:282] Add success.
W0320 01:50:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:50:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:50:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:50:13.420145 543705 net.go:648] Add success.
I0320 01:50:13.423046 543705 net.go:770] primary dev: ETH0
I0320 01:50:13.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:50:13.423075 543705 net.go:698] Add success.
I0320 01:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:50:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:50:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 01:50:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:50:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 01:50:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:50:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:50:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:50:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:50:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:50:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:50:21.993671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:50:21.996129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:50:21.996135 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057c100 0xc00057c140]
E0320 01:50:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:23.409782 543705 memory.go:184] no items to output this cycle
I0320 01:50:23.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:50:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:33.409804 543705 memory.go:184] no items to output this cycle
I0320 01:50:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 01:50:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:43.409798 543705 memory.go:191] Add success.
I0320 01:50:43.409798 543705 cpu.go:282] Add success.
I0320 01:50:43.420019 543705 net.go:648] Add success.
I0320 01:50:43.422962 543705 net.go:770] primary dev: ETH0
I0320 01:50:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:50:43.422986 543705 net.go:698] Add success.
I0320 01:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:50:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:50:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:50:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:50:53.409795 543705 memory.go:184] no items to output this cycle
I0320 01:50:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 01:51:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:03.409773 543705 memory.go:184] no items to output this cycle
I0320 01:51:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 01:51:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:13.409828 543705 memory.go:191] Add success.
I0320 01:51:13.409833 543705 cpu.go:282] Add success.
W0320 01:51:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:51:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:51:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:51:13.420081 543705 net.go:648] Add success.
I0320 01:51:13.422897 543705 net.go:770] primary dev: ETH0
I0320 01:51:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:51:13.422922 543705 net.go:698] Add success.
I0320 01:51:13.469632 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4c41fffe-a196-48a8-8441-c67be94060df","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:51:13.469684 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:51:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:51:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:51:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 01:51:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:51:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 01:51:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:51:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:51:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:51:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:51:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:51:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:51:21.997675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:51:22.000091 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:51:22.000098 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376e00 0xc000376e40]
E0320 01:51:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:23.409791 543705 memory.go:184] no items to output this cycle
I0320 01:51:23.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:51:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:33.409811 543705 memory.go:184] no items to output this cycle
I0320 01:51:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 01:51:38.032863 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:51:38.032870 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:51:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:43.410667 543705 memory.go:191] Add success.
I0320 01:51:43.409806 543705 cpu.go:282] Add success.
I0320 01:51:43.420649 543705 net.go:648] Add success.
I0320 01:51:43.423521 543705 net.go:770] primary dev: ETH0
I0320 01:51:43.423536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:51:43.423549 543705 net.go:698] Add success.
I0320 01:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:51:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:51:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:51:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:51:53.409782 543705 memory.go:184] no items to output this cycle
I0320 01:51:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 01:52:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:03.409810 543705 memory.go:184] no items to output this cycle
I0320 01:52:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:52:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:13.409823 543705 memory.go:191] Add success.
I0320 01:52:13.409825 543705 cpu.go:282] Add success.
W0320 01:52:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:52:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:52:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:52:13.420278 543705 net.go:648] Add success.
I0320 01:52:13.422974 543705 net.go:770] primary dev: ETH0
I0320 01:52:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:52:13.422999 543705 net.go:698] Add success.
W0320 01:52:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:52:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 01:52:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:52:14.456969 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:52:14.456980 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:52:14.456986 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:52:14.457042 543705 disk_worker.go:494] system disk:vda1
I0320 01:52:14.457091 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:52:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:52:15.456802 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:52:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:52:16.457973 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:52:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:52:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:52:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:52:22.001674 543705 disk_info.go:125] begin check local disk info of client
I0320 01:52:22.004102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:52:22.004107 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377540 0xc000377580]
E0320 01:52:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:23.409790 543705 memory.go:184] no items to output this cycle
I0320 01:52:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 01:52:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:33.409811 543705 memory.go:184] no items to output this cycle
I0320 01:52:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 01:52:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:43.409788 543705 memory.go:191] Add success.
I0320 01:52:43.409789 543705 cpu.go:282] Add success.
I0320 01:52:43.420256 543705 net.go:648] Add success.
I0320 01:52:43.422984 543705 net.go:770] primary dev: ETH0
I0320 01:52:43.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:52:43.423009 543705 net.go:698] Add success.
I0320 01:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:52:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:52:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:52:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:52:53.409784 543705 memory.go:184] no items to output this cycle
I0320 01:52:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 01:53:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:03.409780 543705 memory.go:184] no items to output this cycle
I0320 01:53:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 01:53:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:13.409791 543705 memory.go:191] Add success.
I0320 01:53:13.409808 543705 cpu.go:282] Add success.
W0320 01:53:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:53:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:53:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:53:13.420175 543705 net.go:648] Add success.
I0320 01:53:13.422782 543705 net.go:770] primary dev: ETH0
I0320 01:53:13.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:53:13.422808 543705 net.go:698] Add success.
I0320 01:53:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:53:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:53:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 01:53:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:53:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 01:53:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:53:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:53:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:53:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:53:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:53:22.005671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:53:22.008101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:53:22.008107 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587f00 0xc000587f40]
E0320 01:53:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 01:53:23.409787 543705 memory.go:184] no items to output this cycle
E0320 01:53:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:33.409801 543705 memory.go:184] no items to output this cycle
I0320 01:53:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 01:53:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:43.409790 543705 memory.go:191] Add success.
I0320 01:53:43.409791 543705 cpu.go:282] Add success.
I0320 01:53:43.420274 543705 net.go:648] Add success.
I0320 01:53:43.423222 543705 net.go:770] primary dev: ETH0
I0320 01:53:43.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:53:43.423246 543705 net.go:698] Add success.
I0320 01:53:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:53:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:53:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:53:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:53:53.409809 543705 memory.go:184] no items to output this cycle
I0320 01:53:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 01:54:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:03.409783 543705 memory.go:184] no items to output this cycle
I0320 01:54:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 01:54:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:13.409822 543705 memory.go:191] Add success.
I0320 01:54:13.409824 543705 cpu.go:282] Add success.
W0320 01:54:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:54:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:54:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:54:13.420154 543705 net.go:648] Add success.
I0320 01:54:13.422891 543705 net.go:770] primary dev: ETH0
I0320 01:54:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:54:13.422916 543705 net.go:698] Add success.
I0320 01:54:13.470052 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"02132184-4292-4262-9a2a-dd493d7152a3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:54:13.470087 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 01:54:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:54:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:54:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 01:54:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:54:14.456504 543705 disk_worker.go:494] system disk:vda1
I0320 01:54:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:54:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:54:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:54:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:54:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:54:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:54:22.009675 543705 disk_info.go:125] begin check local disk info of client
I0320 01:54:22.012145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:54:22.012151 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a8640 0xc0002a8680]
E0320 01:54:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:23.409788 543705 memory.go:184] no items to output this cycle
I0320 01:54:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 01:54:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:33.409808 543705 memory.go:184] no items to output this cycle
I0320 01:54:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 01:54:38.033740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:54:38.033747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:54:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:43.410703 543705 memory.go:191] Add success.
I0320 01:54:43.409794 543705 cpu.go:282] Add success.
I0320 01:54:43.420460 543705 net.go:648] Add success.
I0320 01:54:43.423002 543705 net.go:770] primary dev: ETH0
I0320 01:54:43.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:54:43.423029 543705 net.go:698] Add success.
I0320 01:54:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:54:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:54:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:54:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:54:53.409814 543705 memory.go:184] no items to output this cycle
I0320 01:54:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 01:55:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:03.409781 543705 memory.go:184] no items to output this cycle
I0320 01:55:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 01:55:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:13.409790 543705 memory.go:191] Add success.
W0320 01:55:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:55:13.409820 543705 cpu.go:282] Add success.
W0320 01:55:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:55:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:55:13.420127 543705 net.go:648] Add success.
I0320 01:55:13.422706 543705 net.go:770] primary dev: ETH0
I0320 01:55:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:55:13.422734 543705 net.go:698] Add success.
I0320 01:55:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:55:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:55:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 01:55:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:55:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 01:55:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:55:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:55:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:55:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:55:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:55:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:55:22.013672 543705 disk_info.go:125] begin check local disk info of client
I0320 01:55:22.016038 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:55:22.016044 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364380 0xc0003643c0]
E0320 01:55:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:23.409784 543705 memory.go:184] no items to output this cycle
I0320 01:55:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 01:55:33.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:33.409887 543705 cpu.go:275] no items to output this cycle
I0320 01:55:33.409921 543705 memory.go:184] no items to output this cycle
E0320 01:55:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:43.409815 543705 memory.go:191] Add success.
I0320 01:55:43.409826 543705 cpu.go:282] Add success.
I0320 01:55:43.420012 543705 net.go:648] Add success.
I0320 01:55:43.422752 543705 net.go:770] primary dev: ETH0
I0320 01:55:43.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:55:43.422781 543705 net.go:698] Add success.
I0320 01:55:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:55:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:55:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:55:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:55:53.409778 543705 memory.go:184] no items to output this cycle
I0320 01:55:53.409800 543705 cpu.go:275] no items to output this cycle
I0320 01:56:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 01:56:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:03.409814 543705 memory.go:184] no items to output this cycle
E0320 01:56:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:13.409812 543705 memory.go:191] Add success.
I0320 01:56:13.409821 543705 cpu.go:282] Add success.
W0320 01:56:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:56:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:56:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:56:13.420173 543705 net.go:648] Add success.
I0320 01:56:13.422781 543705 net.go:770] primary dev: ETH0
I0320 01:56:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:56:13.422810 543705 net.go:698] Add success.
I0320 01:56:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:56:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:56:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 01:56:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:56:14.456678 543705 disk_worker.go:494] system disk:vda1
I0320 01:56:14.456727 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:56:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:56:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:56:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:56:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:56:16.472440 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:56:22.017676 543705 disk_info.go:125] begin check local disk info of client
I0320 01:56:22.020108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:56:22.020114 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003045c0 0xc000304600]
E0320 01:56:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:23.409785 543705 memory.go:184] no items to output this cycle
I0320 01:56:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 01:56:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:33.409784 543705 memory.go:184] no items to output this cycle
I0320 01:56:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 01:56:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:43.409817 543705 memory.go:191] Add success.
I0320 01:56:43.409829 543705 cpu.go:282] Add success.
I0320 01:56:43.419883 543705 net.go:648] Add success.
I0320 01:56:43.422707 543705 net.go:770] primary dev: ETH0
I0320 01:56:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:56:43.422732 543705 net.go:698] Add success.
I0320 01:56:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:56:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:56:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:56:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:56:53.409779 543705 memory.go:184] no items to output this cycle
I0320 01:56:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 01:57:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:03.409785 543705 memory.go:184] no items to output this cycle
I0320 01:57:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 01:57:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:13.409822 543705 memory.go:191] Add success.
I0320 01:57:13.409833 543705 cpu.go:282] Add success.
W0320 01:57:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:57:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:57:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:57:13.420145 543705 net.go:648] Add success.
I0320 01:57:13.422905 543705 net.go:770] primary dev: ETH0
I0320 01:57:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:57:13.422930 543705 net.go:698] Add success.
I0320 01:57:13.429293 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 01:57:13.453479 543705 event_worker.go:152] Polling the log file for events...
I0320 01:57:13.469143 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"732ce3e2-a68e-49c3-b5f1-3040ef3c1b1f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 01:57:13.469180 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 01:57:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:57:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 01:57:14.455171 543705 disk_worker.go:728] disk inode is not compliant
E0320 01:57:14.456595 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 01:57:14.456604 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 01:57:14.456609 543705 custom_config.go:64] query custom config with name: gpu
I0320 01:57:14.457164 543705 disk_worker.go:494] system disk:vda1
I0320 01:57:14.457197 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 01:57:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 01:57:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:57:16.457911 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 01:57:16.457911 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 01:57:16.457967 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:57:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:57:16.472301 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:57:22.021673 543705 disk_info.go:125] begin check local disk info of client
I0320 01:57:22.024064 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:57:22.024071 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa340 0xc0001aa380]
E0320 01:57:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:23.409759 543705 memory.go:184] no items to output this cycle
I0320 01:57:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 01:57:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:33.409798 543705 memory.go:184] no items to output this cycle
I0320 01:57:33.409816 543705 cpu.go:275] no items to output this cycle
I0320 01:57:38.036866 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 01:57:38.036872 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 01:57:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:43.410669 543705 memory.go:191] Add success.
I0320 01:57:43.409800 543705 cpu.go:282] Add success.
I0320 01:57:43.420379 543705 net.go:648] Add success.
I0320 01:57:43.423232 543705 net.go:770] primary dev: ETH0
I0320 01:57:43.423245 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:57:43.423257 543705 net.go:698] Add success.
I0320 01:57:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:57:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:57:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:57:53.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:57:53.409825 543705 memory.go:184] no items to output this cycle
I0320 01:57:53.409835 543705 cpu.go:275] no items to output this cycle
E0320 01:58:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:03.409791 543705 memory.go:184] no items to output this cycle
I0320 01:58:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 01:58:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:13.409794 543705 memory.go:191] Add success.
I0320 01:58:13.409795 543705 cpu.go:282] Add success.
W0320 01:58:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 01:58:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:58:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:58:13.420117 543705 net.go:648] Add success.
I0320 01:58:13.422797 543705 net.go:770] primary dev: ETH0
I0320 01:58:13.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:58:13.422823 543705 net.go:698] Add success.
I0320 01:58:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:58:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:58:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 01:58:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:58:14.457387 543705 disk_worker.go:494] system disk:vda1
I0320 01:58:14.457432 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:58:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:58:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:58:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:58:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:58:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:58:22.025671 543705 disk_info.go:125] begin check local disk info of client
I0320 01:58:22.028130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:58:22.028137 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b3640 0xc0003b3680]
E0320 01:58:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:23.409789 543705 memory.go:184] no items to output this cycle
I0320 01:58:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 01:58:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:33.409785 543705 memory.go:184] no items to output this cycle
I0320 01:58:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 01:58:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:43.409787 543705 memory.go:191] Add success.
I0320 01:58:43.409794 543705 cpu.go:282] Add success.
I0320 01:58:43.419895 543705 net.go:648] Add success.
I0320 01:58:43.422839 543705 net.go:770] primary dev: ETH0
I0320 01:58:43.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:58:43.422865 543705 net.go:698] Add success.
I0320 01:58:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:58:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:58:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:58:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:58:53.409812 543705 memory.go:184] no items to output this cycle
I0320 01:58:53.409829 543705 cpu.go:275] no items to output this cycle
E0320 01:59:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:03.409784 543705 memory.go:184] no items to output this cycle
I0320 01:59:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 01:59:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:13.409787 543705 memory.go:191] Add success.
W0320 01:59:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 01:59:13.409812 543705 cpu.go:282] Add success.
W0320 01:59:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 01:59:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 01:59:13.420083 543705 net.go:648] Add success.
I0320 01:59:13.423245 543705 net.go:770] primary dev: ETH0
I0320 01:59:13.423257 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:59:13.423269 543705 net.go:698] Add success.
I0320 01:59:14.453958 543705 custom_config.go:64] query custom config with name: gpu
W0320 01:59:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 01:59:14.455252 543705 disk_worker.go:708] disk space is not compliant
W0320 01:59:14.455255 543705 disk_worker.go:728] disk inode is not compliant
I0320 01:59:14.456638 543705 disk_worker.go:494] system disk:vda1
I0320 01:59:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 01:59:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 01:59:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:59:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:59:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 01:59:16.472480 543705 disk_local_worker.go:436] Get disk info: []
I0320 01:59:22.029672 543705 disk_info.go:125] begin check local disk info of client
I0320 01:59:22.032106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 01:59:22.032112 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b38c0 0xc0003b3900]
E0320 01:59:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:23.409772 543705 memory.go:184] no items to output this cycle
I0320 01:59:23.409774 543705 cpu.go:275] no items to output this cycle
E0320 01:59:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:33.409804 543705 memory.go:184] no items to output this cycle
I0320 01:59:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 01:59:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:43.409788 543705 memory.go:191] Add success.
I0320 01:59:43.409790 543705 cpu.go:282] Add success.
I0320 01:59:43.419889 543705 net.go:648] Add success.
I0320 01:59:43.423010 543705 net.go:770] primary dev: ETH0
I0320 01:59:43.423026 543705 net.go:802] Send network stats successfully!,count is 6
I0320 01:59:43.423040 543705 net.go:698] Add success.
I0320 01:59:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 01:59:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 01:59:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 01:59:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 01:59:53.409789 543705 memory.go:184] no items to output this cycle
I0320 01:59:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:00:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:03.409770 543705 memory.go:184] no items to output this cycle
I0320 02:00:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:00:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:13.409787 543705 memory.go:191] Add success.
I0320 02:00:13.409793 543705 cpu.go:282] Add success.
W0320 02:00:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:00:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:00:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:00:13.420313 543705 net.go:648] Add success.
I0320 02:00:13.423212 543705 net.go:770] primary dev: ETH0
I0320 02:00:13.423225 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:00:13.423237 543705 net.go:698] Add success.
I0320 02:00:13.468578 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6cb1c3c2-a3a0-4b17-a313-a3061ed4ed13","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:00:13.468611 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:00:14.454990 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:00:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:00:14.455238 543705 disk_worker.go:708] disk space is not compliant
W0320 02:00:14.455241 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:00:14.456821 543705 disk_worker.go:494] system disk:vda1
I0320 02:00:14.456866 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:00:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:00:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:00:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:00:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:00:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:00:22.033675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:00:22.036111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:00:22.036117 543705 disk_info.go:196] parse disk info done, disk is : [0xc000483c40 0xc000483c80]
E0320 02:00:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:23.409789 543705 memory.go:184] no items to output this cycle
I0320 02:00:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 02:00:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:33.409779 543705 memory.go:184] no items to output this cycle
I0320 02:00:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 02:00:38.037753 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:00:38.037760 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:00:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:43.410723 543705 memory.go:191] Add success.
I0320 02:00:43.409839 543705 cpu.go:282] Add success.
I0320 02:00:43.420430 543705 net.go:648] Add success.
I0320 02:00:43.423214 543705 net.go:770] primary dev: ETH0
I0320 02:00:43.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:00:43.423242 543705 net.go:698] Add success.
I0320 02:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:00:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:00:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:00:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:00:53.409826 543705 memory.go:184] no items to output this cycle
I0320 02:00:53.409836 543705 cpu.go:275] no items to output this cycle
E0320 02:01:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:03.409809 543705 memory.go:184] no items to output this cycle
I0320 02:01:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 02:01:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:13.409893 543705 memory.go:191] Add success.
W0320 02:01:13.409925 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:01:13.409938 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:01:13.409941 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:01:13.409946 543705 cpu.go:282] Add success.
I0320 02:01:13.419757 543705 net.go:648] Add success.
I0320 02:01:13.422515 543705 net.go:770] primary dev: ETH0
I0320 02:01:13.422527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:01:13.422540 543705 net.go:698] Add success.
I0320 02:01:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:01:14.455092 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:01:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 02:01:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:01:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 02:01:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:01:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:01:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:01:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:01:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:01:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:01:22.037675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:01:22.040087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:01:22.040093 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa6c0 0xc0001aa700]
E0320 02:01:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:23.409796 543705 memory.go:184] no items to output this cycle
I0320 02:01:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 02:01:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:33.409796 543705 memory.go:184] no items to output this cycle
I0320 02:01:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 02:01:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:43.409831 543705 memory.go:191] Add success.
I0320 02:01:43.409832 543705 cpu.go:282] Add success.
I0320 02:01:43.419873 543705 net.go:648] Add success.
I0320 02:01:43.422585 543705 net.go:770] primary dev: ETH0
I0320 02:01:43.422614 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:01:43.422628 543705 net.go:698] Add success.
I0320 02:01:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:01:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:01:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:01:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:01:53.409812 543705 memory.go:184] no items to output this cycle
I0320 02:01:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 02:02:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:03.409786 543705 memory.go:184] no items to output this cycle
I0320 02:02:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:02:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:13.409791 543705 memory.go:191] Add success.
W0320 02:02:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 02:02:13.409824 543705 cpu.go:282] Add success.
W0320 02:02:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:02:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:02:13.420235 543705 net.go:648] Add success.
I0320 02:02:13.422975 543705 net.go:770] primary dev: ETH0
I0320 02:02:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:02:13.423000 543705 net.go:698] Add success.
W0320 02:02:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:02:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 02:02:14.455226 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:02:14.456901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:02:14.456911 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:02:14.456917 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:02:14.458024 543705 disk_worker.go:494] system disk:vda1
I0320 02:02:14.458064 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:02:15.456894 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:02:15.456907 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:02:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:02:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:02:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:02:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:02:16.472479 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:02:22.041675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:02:22.044138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:02:22.044145 543705 disk_info.go:196] parse disk info done, disk is : [0xc000481280 0xc0004812c0]
E0320 02:02:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:23.409763 543705 memory.go:184] no items to output this cycle
I0320 02:02:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 02:02:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:33.409772 543705 memory.go:184] no items to output this cycle
I0320 02:02:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:02:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:43.409802 543705 memory.go:191] Add success.
I0320 02:02:43.409803 543705 cpu.go:282] Add success.
I0320 02:02:43.419968 543705 net.go:648] Add success.
I0320 02:02:43.422484 543705 net.go:770] primary dev: ETH0
I0320 02:02:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:02:43.422509 543705 net.go:698] Add success.
I0320 02:02:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:02:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:02:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:02:53.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:02:53.409816 543705 memory.go:184] no items to output this cycle
I0320 02:02:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 02:03:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:03.409811 543705 memory.go:184] no items to output this cycle
I0320 02:03:03.409829 543705 cpu.go:275] no items to output this cycle
E0320 02:03:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:13.409809 543705 memory.go:191] Add success.
I0320 02:03:13.409809 543705 cpu.go:282] Add success.
W0320 02:03:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:03:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:03:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:03:13.420122 543705 net.go:648] Add success.
I0320 02:03:13.423005 543705 net.go:770] primary dev: ETH0
I0320 02:03:13.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:03:13.423034 543705 net.go:698] Add success.
I0320 02:03:13.464221 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5e6bc3c8-41ed-482a-a3da-b472937c2c9a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:03:13.464255 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:03:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:03:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:03:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 02:03:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:03:14.456663 543705 disk_worker.go:494] system disk:vda1
I0320 02:03:14.456692 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:03:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:03:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:03:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:03:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:03:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:03:22.045671 543705 disk_info.go:125] begin check local disk info of client
I0320 02:03:22.048180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:03:22.048187 543705 disk_info.go:196] parse disk info done, disk is : [0xc000463180 0xc0004631c0]
E0320 02:03:23.410706 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:23.410724 543705 memory.go:184] no items to output this cycle
I0320 02:03:23.410737 543705 cpu.go:275] no items to output this cycle
E0320 02:03:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:33.409781 543705 memory.go:184] no items to output this cycle
I0320 02:03:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 02:03:38.040894 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:03:38.040901 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:03:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:43.410645 543705 memory.go:191] Add success.
I0320 02:03:43.409808 543705 cpu.go:282] Add success.
I0320 02:03:43.420329 543705 net.go:648] Add success.
I0320 02:03:43.423133 543705 net.go:770] primary dev: ETH0
I0320 02:03:43.423146 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:03:43.423158 543705 net.go:698] Add success.
I0320 02:03:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:03:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:03:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:03:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:03:53.409777 543705 memory.go:184] no items to output this cycle
I0320 02:03:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:04:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:03.409798 543705 memory.go:184] no items to output this cycle
I0320 02:04:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 02:04:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:13.409781 543705 memory.go:191] Add success.
W0320 02:04:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 02:04:13.409811 543705 cpu.go:282] Add success.
W0320 02:04:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:04:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:04:13.420136 543705 net.go:648] Add success.
I0320 02:04:13.423323 543705 net.go:770] primary dev: ETH0
I0320 02:04:13.423336 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:04:13.423347 543705 net.go:698] Add success.
I0320 02:04:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:04:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:04:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 02:04:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:04:14.456555 543705 disk_worker.go:494] system disk:vda1
I0320 02:04:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:04:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:04:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:04:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:04:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:04:16.472484 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:04:22.049675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:04:22.052162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:04:22.052168 543705 disk_info.go:196] parse disk info done, disk is : [0xc000463740 0xc000463780]
E0320 02:04:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:23.409796 543705 memory.go:184] no items to output this cycle
I0320 02:04:23.409810 543705 cpu.go:275] no items to output this cycle
E0320 02:04:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:33.409799 543705 memory.go:184] no items to output this cycle
I0320 02:04:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:04:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:43.409809 543705 memory.go:191] Add success.
I0320 02:04:43.409815 543705 cpu.go:282] Add success.
I0320 02:04:43.419864 543705 net.go:648] Add success.
I0320 02:04:43.422678 543705 net.go:770] primary dev: ETH0
I0320 02:04:43.422690 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:04:43.422704 543705 net.go:698] Add success.
I0320 02:04:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:04:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:04:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:04:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:04:53.409785 543705 memory.go:184] no items to output this cycle
I0320 02:04:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 02:05:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:03.409808 543705 memory.go:184] no items to output this cycle
I0320 02:05:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 02:05:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:13.409823 543705 memory.go:191] Add success.
I0320 02:05:13.409826 543705 cpu.go:282] Add success.
W0320 02:05:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:05:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:05:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:05:13.420563 543705 net.go:648] Add success.
I0320 02:05:13.423073 543705 net.go:770] primary dev: ETH0
I0320 02:05:13.423086 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:05:13.423099 543705 net.go:698] Add success.
I0320 02:05:14.453950 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:05:14.455227 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:05:14.455238 543705 disk_worker.go:708] disk space is not compliant
W0320 02:05:14.455240 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:05:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 02:05:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:05:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:05:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:05:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:05:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:05:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:05:22.053674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:05:22.056073 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:05:22.056079 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046d2c0 0xc00046d300]
E0320 02:05:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:23.409790 543705 memory.go:184] no items to output this cycle
I0320 02:05:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:05:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:33.409771 543705 memory.go:184] no items to output this cycle
I0320 02:05:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:05:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:43.409794 543705 memory.go:191] Add success.
I0320 02:05:43.409795 543705 cpu.go:282] Add success.
I0320 02:05:43.419868 543705 net.go:648] Add success.
I0320 02:05:43.422621 543705 net.go:770] primary dev: ETH0
I0320 02:05:43.422635 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:05:43.422648 543705 net.go:698] Add success.
I0320 02:05:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:05:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:05:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:05:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:05:53.409785 543705 memory.go:184] no items to output this cycle
I0320 02:05:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:06:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:03.409774 543705 memory.go:184] no items to output this cycle
I0320 02:06:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:06:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:13.409814 543705 memory.go:191] Add success.
I0320 02:06:13.409821 543705 cpu.go:282] Add success.
W0320 02:06:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:06:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:06:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:06:13.420063 543705 net.go:648] Add success.
I0320 02:06:13.422813 543705 net.go:770] primary dev: ETH0
I0320 02:06:13.422825 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:06:13.422836 543705 net.go:698] Add success.
I0320 02:06:13.474093 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cf61cd08-58ae-4967-bc89-9396689e88fb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:06:13.474124 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:06:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:06:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:06:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 02:06:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:06:14.456743 543705 disk_worker.go:494] system disk:vda1
I0320 02:06:14.456770 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:06:15.455629 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:06:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:06:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:06:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:06:16.472449 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:06:22.057675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:06:22.060249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:06:22.060255 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ff600 0xc0003ff640]
E0320 02:06:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:23.409781 543705 memory.go:184] no items to output this cycle
I0320 02:06:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:06:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:33.409799 543705 memory.go:184] no items to output this cycle
I0320 02:06:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 02:06:38.041741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:06:38.041747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:06:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:43.410690 543705 memory.go:191] Add success.
I0320 02:06:43.409797 543705 cpu.go:282] Add success.
I0320 02:06:43.420406 543705 net.go:648] Add success.
I0320 02:06:43.423143 543705 net.go:770] primary dev: ETH0
I0320 02:06:43.423164 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:06:43.423180 543705 net.go:698] Add success.
I0320 02:06:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:06:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:06:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:06:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:06:53.409819 543705 memory.go:184] no items to output this cycle
I0320 02:06:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 02:07:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:03.409801 543705 memory.go:184] no items to output this cycle
I0320 02:07:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 02:07:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:13.409781 543705 memory.go:191] Add success.
W0320 02:07:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 02:07:13.409815 543705 cpu.go:282] Add success.
W0320 02:07:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:07:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:07:13.420448 543705 net.go:648] Add success.
I0320 02:07:13.423257 543705 net.go:770] primary dev: ETH0
I0320 02:07:13.423269 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:07:13.423281 543705 net.go:698] Add success.
I0320 02:07:13.452772 543705 event_worker.go:152] Polling the log file for events...
W0320 02:07:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:07:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 02:07:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:07:14.456194 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:07:14.456204 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:07:14.456211 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:07:14.457304 543705 disk_worker.go:494] system disk:vda1
I0320 02:07:14.457334 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:07:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:07:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:07:16.457931 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:07:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:07:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:07:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:07:16.472335 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:07:22.061672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:07:22.064045 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:07:22.064060 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034d180 0xc00034d1c0]
E0320 02:07:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:23.409790 543705 memory.go:184] no items to output this cycle
I0320 02:07:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:07:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:33.409779 543705 memory.go:184] no items to output this cycle
I0320 02:07:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 02:07:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:43.409775 543705 memory.go:191] Add success.
I0320 02:07:43.409811 543705 cpu.go:282] Add success.
I0320 02:07:43.419824 543705 net.go:648] Add success.
I0320 02:07:43.422926 543705 net.go:770] primary dev: ETH0
I0320 02:07:43.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:07:43.422952 543705 net.go:698] Add success.
I0320 02:07:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:07:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:07:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:07:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:07:53.409788 543705 memory.go:184] no items to output this cycle
I0320 02:07:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:08:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:03.409781 543705 memory.go:184] no items to output this cycle
I0320 02:08:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:08:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:13.409790 543705 memory.go:191] Add success.
I0320 02:08:13.409796 543705 cpu.go:282] Add success.
W0320 02:08:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:08:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:08:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:08:13.420129 543705 net.go:648] Add success.
I0320 02:08:13.423200 543705 net.go:770] primary dev: ETH0
I0320 02:08:13.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:08:13.423227 543705 net.go:698] Add success.
I0320 02:08:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:08:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:08:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 02:08:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:08:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 02:08:14.456773 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:08:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:08:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:08:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:08:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:08:16.472433 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:08:22.065672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:08:22.068145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:08:22.068152 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a240 0xc00036a280]
E0320 02:08:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:23.409759 543705 memory.go:184] no items to output this cycle
I0320 02:08:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 02:08:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:33.409768 543705 memory.go:184] no items to output this cycle
I0320 02:08:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 02:08:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:43.409789 543705 memory.go:191] Add success.
I0320 02:08:43.409794 543705 cpu.go:282] Add success.
I0320 02:08:43.419854 543705 net.go:648] Add success.
I0320 02:08:43.422325 543705 net.go:770] primary dev: ETH0
I0320 02:08:43.422338 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:08:43.422352 543705 net.go:698] Add success.
I0320 02:08:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:08:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:08:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:08:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:08:53.409801 543705 memory.go:184] no items to output this cycle
I0320 02:08:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 02:09:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:03.409771 543705 memory.go:184] no items to output this cycle
I0320 02:09:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 02:09:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:13.409797 543705 memory.go:191] Add success.
I0320 02:09:13.409799 543705 cpu.go:282] Add success.
W0320 02:09:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:09:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:09:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:09:13.420208 543705 net.go:648] Add success.
I0320 02:09:13.422811 543705 net.go:770] primary dev: ETH0
I0320 02:09:13.422825 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:09:13.422839 543705 net.go:698] Add success.
I0320 02:09:13.463888 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"379d19d8-1fe5-4df1-8bd0-7f91107facb2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:09:13.463923 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:09:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:09:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:09:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 02:09:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:09:14.456737 543705 disk_worker.go:494] system disk:vda1
I0320 02:09:14.456769 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:09:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:09:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:09:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:09:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:09:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:09:22.069674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:09:22.072062 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:09:22.072068 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394200 0xc000394240]
E0320 02:09:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:23.409794 543705 memory.go:184] no items to output this cycle
I0320 02:09:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:09:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:33.409765 543705 memory.go:184] no items to output this cycle
I0320 02:09:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 02:09:38.044915 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:09:38.044923 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:09:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:43.410618 543705 memory.go:191] Add success.
I0320 02:09:43.409822 543705 cpu.go:282] Add success.
I0320 02:09:43.420411 543705 net.go:648] Add success.
I0320 02:09:43.423057 543705 net.go:770] primary dev: ETH0
I0320 02:09:43.423070 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:09:43.423083 543705 net.go:698] Add success.
I0320 02:09:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:09:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:09:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:09:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:09:53.409785 543705 memory.go:184] no items to output this cycle
I0320 02:09:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 02:10:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:03.409775 543705 memory.go:184] no items to output this cycle
I0320 02:10:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 02:10:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:13.409792 543705 memory.go:191] Add success.
I0320 02:10:13.409794 543705 cpu.go:282] Add success.
W0320 02:10:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:10:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:10:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:10:13.420052 543705 net.go:648] Add success.
I0320 02:10:13.422852 543705 net.go:770] primary dev: ETH0
I0320 02:10:13.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:10:13.422882 543705 net.go:698] Add success.
I0320 02:10:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:10:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:10:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 02:10:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:10:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 02:10:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:10:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:10:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:10:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:10:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:10:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:10:22.073672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:10:22.076131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:10:22.076138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ae540 0xc0002ae580]
E0320 02:10:23.409740 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:23.409754 543705 memory.go:184] no items to output this cycle
I0320 02:10:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:10:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:33.409802 543705 memory.go:184] no items to output this cycle
I0320 02:10:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 02:10:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:43.409778 543705 memory.go:191] Add success.
I0320 02:10:43.409802 543705 cpu.go:282] Add success.
I0320 02:10:43.419954 543705 net.go:648] Add success.
I0320 02:10:43.422700 543705 net.go:770] primary dev: ETH0
I0320 02:10:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:10:43.422729 543705 net.go:698] Add success.
I0320 02:10:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:10:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:10:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:10:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:10:53.409790 543705 cpu.go:275] no items to output this cycle
I0320 02:10:53.409801 543705 memory.go:184] no items to output this cycle
E0320 02:11:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:03.409767 543705 memory.go:184] no items to output this cycle
I0320 02:11:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 02:11:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:13.409794 543705 memory.go:191] Add success.
I0320 02:11:13.409813 543705 cpu.go:282] Add success.
W0320 02:11:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:11:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:11:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:11:13.420144 543705 net.go:648] Add success.
I0320 02:11:13.423068 543705 net.go:770] primary dev: ETH0
I0320 02:11:13.423080 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:11:13.423092 543705 net.go:698] Add success.
I0320 02:11:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:11:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:11:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 02:11:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:11:14.456618 543705 disk_worker.go:494] system disk:vda1
I0320 02:11:14.456650 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:11:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:11:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:11:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:11:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:11:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:11:22.077664 543705 disk_info.go:125] begin check local disk info of client
I0320 02:11:22.080047 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:11:22.080054 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5d40 0xc0000c5d80]
E0320 02:11:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:23.409773 543705 memory.go:184] no items to output this cycle
I0320 02:11:23.409772 543705 cpu.go:275] no items to output this cycle
E0320 02:11:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:33.409779 543705 memory.go:184] no items to output this cycle
I0320 02:11:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 02:11:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:43.409791 543705 memory.go:191] Add success.
I0320 02:11:43.409792 543705 cpu.go:282] Add success.
I0320 02:11:43.419941 543705 net.go:648] Add success.
I0320 02:11:43.422805 543705 net.go:770] primary dev: ETH0
I0320 02:11:43.422818 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:11:43.422830 543705 net.go:698] Add success.
I0320 02:11:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:11:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:11:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:11:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:11:53.409776 543705 memory.go:184] no items to output this cycle
I0320 02:11:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:12:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:03.409772 543705 memory.go:184] no items to output this cycle
I0320 02:12:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 02:12:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:13.409785 543705 memory.go:191] Add success.
I0320 02:12:13.409803 543705 cpu.go:282] Add success.
W0320 02:12:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:12:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:12:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:12:13.420216 543705 net.go:648] Add success.
I0320 02:12:13.422868 543705 net.go:770] primary dev: ETH0
I0320 02:12:13.422881 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:12:13.422893 543705 net.go:698] Add success.
I0320 02:12:13.469267 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1955f021-d33f-412e-a561-cb4b0a730994","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:12:13.469300 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 02:12:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:12:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 02:12:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:12:14.457008 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:12:14.457017 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:12:14.457023 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:12:14.457042 543705 disk_worker.go:494] system disk:vda1
I0320 02:12:14.457081 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:12:15.456938 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:12:15.456958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:12:16.458006 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:12:16.458006 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:12:16.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:12:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:12:16.472428 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:12:22.081676 543705 disk_info.go:125] begin check local disk info of client
I0320 02:12:22.084108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:12:22.084114 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003420c0 0xc000342100]
E0320 02:12:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:23.409790 543705 memory.go:184] no items to output this cycle
I0320 02:12:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 02:12:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:33.409781 543705 memory.go:184] no items to output this cycle
I0320 02:12:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 02:12:38.045728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:12:38.045734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:12:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:43.410648 543705 memory.go:191] Add success.
I0320 02:12:43.409794 543705 cpu.go:282] Add success.
I0320 02:12:43.420379 543705 net.go:648] Add success.
I0320 02:12:43.423101 543705 net.go:770] primary dev: ETH0
I0320 02:12:43.423114 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:12:43.423126 543705 net.go:698] Add success.
I0320 02:12:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:12:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:12:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:12:53.410275 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:12:53.410298 543705 memory.go:184] no items to output this cycle
I0320 02:12:53.410307 543705 cpu.go:275] no items to output this cycle
E0320 02:13:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:03.409781 543705 cpu.go:275] no items to output this cycle
I0320 02:13:03.409783 543705 memory.go:184] no items to output this cycle
E0320 02:13:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:13.409820 543705 memory.go:191] Add success.
I0320 02:13:13.409830 543705 cpu.go:282] Add success.
W0320 02:13:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:13:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:13:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:13:13.420198 543705 net.go:648] Add success.
I0320 02:13:13.422859 543705 net.go:770] primary dev: ETH0
I0320 02:13:13.422872 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:13:13.422885 543705 net.go:698] Add success.
I0320 02:13:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:13:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:13:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 02:13:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:13:14.456621 543705 disk_worker.go:494] system disk:vda1
I0320 02:13:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:13:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:13:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:13:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:13:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:13:22.085675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:13:22.088074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:13:22.088080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fea80 0xc0003feac0]
E0320 02:13:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:23.409785 543705 memory.go:184] no items to output this cycle
I0320 02:13:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 02:13:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:33.409774 543705 memory.go:184] no items to output this cycle
I0320 02:13:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 02:13:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:43.409807 543705 memory.go:191] Add success.
I0320 02:13:43.409815 543705 cpu.go:282] Add success.
I0320 02:13:43.419865 543705 net.go:648] Add success.
I0320 02:13:43.422790 543705 net.go:770] primary dev: ETH0
I0320 02:13:43.422803 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:13:43.422817 543705 net.go:698] Add success.
I0320 02:13:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:13:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:13:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:13:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:13:53.409803 543705 memory.go:184] no items to output this cycle
I0320 02:13:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:14:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:03.409782 543705 memory.go:184] no items to output this cycle
I0320 02:14:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:14:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:13.409788 543705 memory.go:191] Add success.
I0320 02:14:13.409790 543705 cpu.go:282] Add success.
W0320 02:14:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:14:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:14:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:14:13.420062 543705 net.go:648] Add success.
I0320 02:14:13.422697 543705 net.go:770] primary dev: ETH0
I0320 02:14:13.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:14:13.422721 543705 net.go:698] Add success.
I0320 02:14:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:14:14.455373 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:14:14.455382 543705 disk_worker.go:708] disk space is not compliant
W0320 02:14:14.455389 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:14:14.457510 543705 disk_worker.go:494] system disk:vda1
I0320 02:14:14.457538 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:14:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:14:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:14:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:14:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:14:16.472423 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:14:22.089677 543705 disk_info.go:125] begin check local disk info of client
I0320 02:14:22.092159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:14:22.092164 543705 disk_info.go:196] parse disk info done, disk is : [0xc000542340 0xc000542380]
E0320 02:14:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:23.409766 543705 memory.go:184] no items to output this cycle
I0320 02:14:23.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:14:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:33.409796 543705 memory.go:184] no items to output this cycle
I0320 02:14:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 02:14:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:43.409785 543705 memory.go:191] Add success.
I0320 02:14:43.409806 543705 cpu.go:282] Add success.
I0320 02:14:43.419917 543705 net.go:648] Add success.
I0320 02:14:43.422716 543705 net.go:770] primary dev: ETH0
I0320 02:14:43.422729 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:14:43.422743 543705 net.go:698] Add success.
I0320 02:14:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:14:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:14:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:14:53.409793 543705 memory.go:184] no items to output this cycle
I0320 02:14:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:15:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:03.409764 543705 memory.go:184] no items to output this cycle
I0320 02:15:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:15:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:13.409816 543705 memory.go:191] Add success.
I0320 02:15:13.409826 543705 cpu.go:282] Add success.
W0320 02:15:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:15:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:15:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:15:13.420250 543705 net.go:648] Add success.
I0320 02:15:13.423042 543705 net.go:770] primary dev: ETH0
I0320 02:15:13.423054 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:15:13.423072 543705 net.go:698] Add success.
I0320 02:15:13.552046 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e38996bd-f6d6-416a-b5d0-0545710e58a5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:15:13.552078 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:15:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:15:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:15:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 02:15:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:15:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 02:15:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:15:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:15:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:15:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:15:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:15:16.472435 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:15:22.093671 543705 disk_info.go:125] begin check local disk info of client
I0320 02:15:22.096120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:15:22.096126 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025c640 0xc00025c680]
E0320 02:15:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:23.409783 543705 memory.go:184] no items to output this cycle
I0320 02:15:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:15:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:33.409770 543705 memory.go:184] no items to output this cycle
I0320 02:15:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 02:15:38.045870 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:15:38.045877 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:15:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:43.410575 543705 memory.go:191] Add success.
I0320 02:15:43.409804 543705 cpu.go:282] Add success.
I0320 02:15:43.420345 543705 net.go:648] Add success.
I0320 02:15:43.422768 543705 net.go:770] primary dev: ETH0
I0320 02:15:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:15:43.422794 543705 net.go:698] Add success.
I0320 02:15:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:15:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:15:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:15:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:15:53.409775 543705 memory.go:184] no items to output this cycle
I0320 02:15:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:16:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:03.409764 543705 memory.go:184] no items to output this cycle
I0320 02:16:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:16:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:13.409786 543705 memory.go:191] Add success.
W0320 02:16:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:16:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:16:13.409822 543705 cpu.go:282] Add success.
I0320 02:16:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:16:13.420109 543705 net.go:648] Add success.
I0320 02:16:13.423197 543705 net.go:770] primary dev: ETH0
I0320 02:16:13.423212 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:16:13.423226 543705 net.go:698] Add success.
I0320 02:16:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:16:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:16:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 02:16:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:16:14.456784 543705 disk_worker.go:494] system disk:vda1
I0320 02:16:14.456812 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:16:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:16:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:16:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:16:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:16:16.472465 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:16:22.097673 543705 disk_info.go:125] begin check local disk info of client
I0320 02:16:22.100179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:16:22.100186 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376440 0xc000376480]
E0320 02:16:23.409739 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:23.409753 543705 memory.go:184] no items to output this cycle
I0320 02:16:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:16:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:33.409796 543705 memory.go:184] no items to output this cycle
I0320 02:16:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 02:16:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:43.409785 543705 memory.go:191] Add success.
I0320 02:16:43.409803 543705 cpu.go:282] Add success.
I0320 02:16:43.419844 543705 net.go:648] Add success.
I0320 02:16:43.422577 543705 net.go:770] primary dev: ETH0
I0320 02:16:43.422589 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:16:43.422601 543705 net.go:698] Add success.
I0320 02:16:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:16:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:16:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:16:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:16:53.409773 543705 memory.go:184] no items to output this cycle
I0320 02:16:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:17:03.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:03.409895 543705 cpu.go:275] no items to output this cycle
I0320 02:17:03.409899 543705 memory.go:184] no items to output this cycle
E0320 02:17:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:13.409821 543705 memory.go:191] Add success.
I0320 02:17:13.409831 543705 cpu.go:282] Add success.
W0320 02:17:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:17:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:17:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:17:13.420206 543705 net.go:648] Add success.
I0320 02:17:13.422936 543705 net.go:770] primary dev: ETH0
I0320 02:17:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:17:13.422961 543705 net.go:698] Add success.
I0320 02:17:13.453613 543705 event_worker.go:152] Polling the log file for events...
W0320 02:17:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:17:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 02:17:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:17:14.456955 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:17:14.456964 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:17:14.456971 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:17:14.457023 543705 disk_worker.go:494] system disk:vda1
I0320 02:17:14.457051 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:17:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:17:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:17:16.457923 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:17:16.457923 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:17:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:17:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:17:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:17:22.101671 543705 disk_info.go:125] begin check local disk info of client
I0320 02:17:22.104097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:17:22.104104 543705 disk_info.go:196] parse disk info done, disk is : [0xc000467800 0xc000467840]
E0320 02:17:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:23.409764 543705 memory.go:184] no items to output this cycle
I0320 02:17:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 02:17:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:33.409803 543705 memory.go:184] no items to output this cycle
I0320 02:17:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 02:17:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:43.409791 543705 memory.go:191] Add success.
I0320 02:17:43.409821 543705 cpu.go:282] Add success.
I0320 02:17:43.419876 543705 net.go:648] Add success.
I0320 02:17:43.422639 543705 net.go:770] primary dev: ETH0
I0320 02:17:43.422661 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:17:43.422675 543705 net.go:698] Add success.
I0320 02:17:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:17:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:17:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:17:53.409809 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:17:53.409831 543705 memory.go:184] no items to output this cycle
I0320 02:17:53.409835 543705 cpu.go:275] no items to output this cycle
E0320 02:18:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 02:18:03.409798 543705 memory.go:184] no items to output this cycle
E0320 02:18:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:13.409793 543705 memory.go:191] Add success.
I0320 02:18:13.409810 543705 cpu.go:282] Add success.
W0320 02:18:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:18:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:18:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:18:13.420120 543705 net.go:648] Add success.
I0320 02:18:13.422694 543705 net.go:770] primary dev: ETH0
I0320 02:18:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:18:13.422719 543705 net.go:698] Add success.
I0320 02:18:13.468290 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e7d5216b-105f-445a-9a7a-70d9799beb30","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:18:13.468323 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:18:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:18:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:18:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 02:18:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:18:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 02:18:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:18:15.455989 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:18:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:18:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:18:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:18:16.472534 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:18:22.105674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:18:22.108221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:18:22.108227 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b3d40 0xc0003b3d80]
E0320 02:18:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:23.409768 543705 memory.go:184] no items to output this cycle
I0320 02:18:23.409774 543705 cpu.go:275] no items to output this cycle
E0320 02:18:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:33.409815 543705 memory.go:184] no items to output this cycle
I0320 02:18:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 02:18:38.046024 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:18:38.046031 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:18:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:43.410667 543705 memory.go:191] Add success.
I0320 02:18:43.409830 543705 cpu.go:282] Add success.
I0320 02:18:43.420357 543705 net.go:648] Add success.
I0320 02:18:43.423187 543705 net.go:770] primary dev: ETH0
I0320 02:18:43.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:18:43.423212 543705 net.go:698] Add success.
I0320 02:18:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:18:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:18:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:18:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:18:53.409822 543705 memory.go:184] no items to output this cycle
I0320 02:18:53.409831 543705 cpu.go:275] no items to output this cycle
E0320 02:19:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:03.409795 543705 memory.go:184] no items to output this cycle
I0320 02:19:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 02:19:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:13.409807 543705 cpu.go:282] Add success.
I0320 02:19:13.409820 543705 memory.go:191] Add success.
W0320 02:19:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:19:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:19:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:19:13.420128 543705 net.go:648] Add success.
I0320 02:19:13.422873 543705 net.go:770] primary dev: ETH0
I0320 02:19:13.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:19:13.422904 543705 net.go:698] Add success.
I0320 02:19:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:19:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:19:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 02:19:14.455152 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:19:14.456484 543705 disk_worker.go:494] system disk:vda1
I0320 02:19:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:19:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:19:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:19:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:19:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:19:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:19:22.109676 543705 disk_info.go:125] begin check local disk info of client
I0320 02:19:22.112098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:19:22.112104 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b1b00 0xc0002b1b40]
E0320 02:19:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:23.409787 543705 memory.go:184] no items to output this cycle
I0320 02:19:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 02:19:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:33.409777 543705 memory.go:184] no items to output this cycle
I0320 02:19:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 02:19:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:43.409796 543705 memory.go:191] Add success.
I0320 02:19:43.409797 543705 cpu.go:282] Add success.
I0320 02:19:43.419909 543705 net.go:648] Add success.
I0320 02:19:43.422695 543705 net.go:770] primary dev: ETH0
I0320 02:19:43.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:19:43.422724 543705 net.go:698] Add success.
I0320 02:19:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:19:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:19:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:19:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:19:53.409776 543705 memory.go:184] no items to output this cycle
I0320 02:19:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 02:20:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:03.409769 543705 memory.go:184] no items to output this cycle
I0320 02:20:03.409889 543705 cpu.go:275] no items to output this cycle
E0320 02:20:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:13.409791 543705 memory.go:191] Add success.
I0320 02:20:13.409792 543705 cpu.go:282] Add success.
W0320 02:20:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:20:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:20:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:20:13.420084 543705 net.go:648] Add success.
I0320 02:20:13.422790 543705 net.go:770] primary dev: ETH0
I0320 02:20:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:20:13.422817 543705 net.go:698] Add success.
I0320 02:20:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:20:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:20:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 02:20:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:20:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 02:20:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:20:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:20:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:20:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:20:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:20:16.472419 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:20:22.113673 543705 disk_info.go:125] begin check local disk info of client
I0320 02:20:22.116176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:20:22.116182 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b080 0xc00007b0c0]
E0320 02:20:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:23.409759 543705 memory.go:184] no items to output this cycle
I0320 02:20:23.409788 543705 cpu.go:275] no items to output this cycle
E0320 02:20:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:33.409776 543705 cpu.go:275] no items to output this cycle
I0320 02:20:33.409781 543705 memory.go:184] no items to output this cycle
E0320 02:20:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:43.409818 543705 memory.go:191] Add success.
I0320 02:20:43.409821 543705 cpu.go:282] Add success.
I0320 02:20:43.420196 543705 net.go:648] Add success.
I0320 02:20:43.423068 543705 net.go:770] primary dev: ETH0
I0320 02:20:43.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:20:43.423094 543705 net.go:698] Add success.
I0320 02:20:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:20:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:20:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:20:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:20:53.409788 543705 memory.go:184] no items to output this cycle
I0320 02:20:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 02:21:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:03.409866 543705 memory.go:184] no items to output this cycle
I0320 02:21:03.409949 543705 cpu.go:275] no items to output this cycle
E0320 02:21:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:13.409798 543705 memory.go:191] Add success.
I0320 02:21:13.409802 543705 cpu.go:282] Add success.
W0320 02:21:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:21:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:21:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:21:13.420288 543705 net.go:648] Add success.
I0320 02:21:13.423073 543705 net.go:770] primary dev: ETH0
I0320 02:21:13.423086 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:21:13.423098 543705 net.go:698] Add success.
I0320 02:21:13.700343 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bc830f46-6160-492f-ac3a-ef8def47e51d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:21:13.700381 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:21:14.453978 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:21:14.454236 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:21:14.454247 543705 disk_worker.go:708] disk space is not compliant
W0320 02:21:14.454250 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:21:14.455797 543705 disk_worker.go:494] system disk:vda1
I0320 02:21:14.455829 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:21:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:21:16.457579 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:21:16.457676 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:21:16.457719 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:21:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:21:22.117675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:21:22.120191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:21:22.120198 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466cc0 0xc000466d00]
E0320 02:21:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:23.409792 543705 memory.go:184] no items to output this cycle
I0320 02:21:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:21:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:33.409785 543705 memory.go:184] no items to output this cycle
I0320 02:21:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 02:21:38.048952 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:21:38.048959 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:21:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:43.410687 543705 memory.go:191] Add success.
I0320 02:21:43.409837 543705 cpu.go:282] Add success.
I0320 02:21:43.420392 543705 net.go:648] Add success.
I0320 02:21:43.423308 543705 net.go:770] primary dev: ETH0
I0320 02:21:43.423320 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:21:43.423332 543705 net.go:698] Add success.
I0320 02:21:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:21:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:21:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:21:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:21:53.409782 543705 memory.go:184] no items to output this cycle
I0320 02:21:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:22:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:03.409779 543705 memory.go:184] no items to output this cycle
I0320 02:22:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:22:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:13.409809 543705 memory.go:191] Add success.
I0320 02:22:13.409817 543705 cpu.go:282] Add success.
W0320 02:22:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:22:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:22:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:22:13.420235 543705 net.go:648] Add success.
I0320 02:22:13.422883 543705 net.go:770] primary dev: ETH0
I0320 02:22:13.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:22:13.422917 543705 net.go:698] Add success.
W0320 02:22:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:22:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 02:22:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:22:14.456885 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:22:14.456894 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:22:14.456900 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:22:14.456989 543705 disk_worker.go:494] system disk:vda1
I0320 02:22:14.457031 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:22:15.456806 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:22:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 02:22:16.457984 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:22:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:22:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:22:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:22:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:22:22.121674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:22:22.124133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:22:22.124139 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376080 0xc0003760c0]
E0320 02:22:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:23.409787 543705 memory.go:184] no items to output this cycle
I0320 02:22:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:22:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:33.409773 543705 memory.go:184] no items to output this cycle
I0320 02:22:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 02:22:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:43.409795 543705 memory.go:191] Add success.
I0320 02:22:43.409796 543705 cpu.go:282] Add success.
I0320 02:22:43.419865 543705 net.go:648] Add success.
I0320 02:22:43.422534 543705 net.go:770] primary dev: ETH0
I0320 02:22:43.422546 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:22:43.422560 543705 net.go:698] Add success.
I0320 02:22:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:22:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:22:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:22:53.410265 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:22:53.410285 543705 memory.go:184] no items to output this cycle
I0320 02:22:53.410295 543705 cpu.go:275] no items to output this cycle
E0320 02:23:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:03.409901 543705 memory.go:184] no items to output this cycle
I0320 02:23:03.409945 543705 cpu.go:275] no items to output this cycle
E0320 02:23:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:13.409809 543705 memory.go:191] Add success.
I0320 02:23:13.409811 543705 cpu.go:282] Add success.
W0320 02:23:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:23:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:23:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:23:13.420060 543705 net.go:770] primary dev: ETH0
I0320 02:23:13.420074 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:23:13.420086 543705 net.go:698] Add success.
I0320 02:23:13.420326 543705 net.go:648] Add success.
I0320 02:23:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:23:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:23:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 02:23:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:23:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 02:23:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:23:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:23:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:23:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:23:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:23:16.472454 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:23:22.125674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:23:22.128083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:23:22.128088 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e900 0xc00039e940]
I0320 02:23:23.409778 543705 cpu.go:275] no items to output this cycle
E0320 02:23:23.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:23.409864 543705 memory.go:184] no items to output this cycle
E0320 02:23:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 02:23:33.409786 543705 memory.go:184] no items to output this cycle
E0320 02:23:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:43.409820 543705 memory.go:191] Add success.
I0320 02:23:43.409824 543705 cpu.go:282] Add success.
I0320 02:23:43.419878 543705 net.go:648] Add success.
I0320 02:23:43.422731 543705 net.go:770] primary dev: ETH0
I0320 02:23:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:23:43.422757 543705 net.go:698] Add success.
I0320 02:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:23:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:23:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:23:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:23:53.409782 543705 memory.go:184] no items to output this cycle
I0320 02:23:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 02:24:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:03.409786 543705 memory.go:184] no items to output this cycle
I0320 02:24:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 02:24:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:13.409789 543705 memory.go:191] Add success.
I0320 02:24:13.409806 543705 cpu.go:282] Add success.
W0320 02:24:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:24:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:24:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:24:13.420139 543705 net.go:648] Add success.
I0320 02:24:13.422805 543705 net.go:770] primary dev: ETH0
I0320 02:24:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:24:13.422832 543705 net.go:698] Add success.
I0320 02:24:13.468854 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7801f2d0-052f-481d-9678-7087b3e8e79f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:24:13.468888 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:24:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:24:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:24:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 02:24:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:24:14.456703 543705 disk_worker.go:494] system disk:vda1
I0320 02:24:14.456745 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:24:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:24:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:24:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:24:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:24:16.472482 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:24:22.129673 543705 disk_info.go:125] begin check local disk info of client
I0320 02:24:22.132194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:24:22.132201 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa680 0xc0001aa6c0]
E0320 02:24:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:23.409769 543705 memory.go:184] no items to output this cycle
I0320 02:24:23.409776 543705 cpu.go:275] no items to output this cycle
E0320 02:24:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:33.409769 543705 memory.go:184] no items to output this cycle
I0320 02:24:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 02:24:38.049741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:24:38.049748 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:24:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:43.410693 543705 memory.go:191] Add success.
I0320 02:24:43.409828 543705 cpu.go:282] Add success.
I0320 02:24:43.420466 543705 net.go:648] Add success.
I0320 02:24:43.423247 543705 net.go:770] primary dev: ETH0
I0320 02:24:43.423262 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:24:43.423277 543705 net.go:698] Add success.
I0320 02:24:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:24:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:24:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:24:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:24:53.409778 543705 memory.go:184] no items to output this cycle
I0320 02:24:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:25:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:03.409772 543705 memory.go:184] no items to output this cycle
I0320 02:25:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:25:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:13.409831 543705 memory.go:191] Add success.
I0320 02:25:13.409839 543705 cpu.go:282] Add success.
W0320 02:25:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:25:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:25:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:25:13.420301 543705 net.go:648] Add success.
I0320 02:25:13.422822 543705 net.go:770] primary dev: ETH0
I0320 02:25:13.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:25:13.422856 543705 net.go:698] Add success.
I0320 02:25:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:25:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:25:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 02:25:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:25:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 02:25:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:25:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:25:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:25:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:25:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:25:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:25:22.133672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:25:22.136119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:25:22.136124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003771c0 0xc000377200]
E0320 02:25:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:23.409764 543705 memory.go:184] no items to output this cycle
I0320 02:25:23.409784 543705 cpu.go:275] no items to output this cycle
E0320 02:25:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 02:25:33.409782 543705 memory.go:184] no items to output this cycle
E0320 02:25:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:43.409820 543705 memory.go:191] Add success.
I0320 02:25:43.409830 543705 cpu.go:282] Add success.
I0320 02:25:43.419893 543705 net.go:648] Add success.
I0320 02:25:43.422767 543705 net.go:770] primary dev: ETH0
I0320 02:25:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:25:43.422792 543705 net.go:698] Add success.
I0320 02:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:25:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:25:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:25:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:25:53.409808 543705 memory.go:184] no items to output this cycle
I0320 02:25:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 02:26:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:03.409796 543705 memory.go:184] no items to output this cycle
I0320 02:26:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 02:26:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:13.409791 543705 cpu.go:282] Add success.
I0320 02:26:13.409792 543705 memory.go:191] Add success.
W0320 02:26:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:26:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:26:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:26:13.420329 543705 net.go:648] Add success.
I0320 02:26:13.423054 543705 net.go:770] primary dev: ETH0
I0320 02:26:13.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:26:13.423079 543705 net.go:698] Add success.
I0320 02:26:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:26:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:26:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 02:26:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:26:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 02:26:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:26:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:26:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:26:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:26:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:26:16.472435 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:26:22.137679 543705 disk_info.go:125] begin check local disk info of client
I0320 02:26:22.140178 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:26:22.140184 543705 disk_info.go:196] parse disk info done, disk is : [0xc000582a40 0xc000582a80]
E0320 02:26:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:23.409790 543705 memory.go:184] no items to output this cycle
I0320 02:26:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:26:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 02:26:33.409790 543705 memory.go:184] no items to output this cycle
E0320 02:26:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:43.409798 543705 memory.go:191] Add success.
I0320 02:26:43.409800 543705 cpu.go:282] Add success.
I0320 02:26:43.419957 543705 net.go:648] Add success.
I0320 02:26:43.422870 543705 net.go:770] primary dev: ETH0
I0320 02:26:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:26:43.422896 543705 net.go:698] Add success.
I0320 02:26:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:26:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:26:53.410232 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:26:53.410256 543705 memory.go:184] no items to output this cycle
I0320 02:26:53.410294 543705 cpu.go:275] no items to output this cycle
E0320 02:27:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:03.409776 543705 memory.go:184] no items to output this cycle
I0320 02:27:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:27:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:13.409801 543705 memory.go:191] Add success.
I0320 02:27:13.409802 543705 cpu.go:282] Add success.
W0320 02:27:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:27:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:27:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:27:13.420316 543705 net.go:648] Add success.
I0320 02:27:13.422973 543705 net.go:770] primary dev: ETH0
I0320 02:27:13.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:27:13.422998 543705 net.go:698] Add success.
I0320 02:27:13.429667 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 02:27:13.452768 543705 event_worker.go:152] Polling the log file for events...
I0320 02:27:13.467956 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"20f824ee-95aa-4f7e-9183-7767be20eea7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:27:13.467989 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 02:27:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:27:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 02:27:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:27:14.455911 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:27:14.455920 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:27:14.455925 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:27:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 02:27:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:27:15.456799 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:27:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:27:16.457931 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:27:16.457931 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:27:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:27:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:27:16.472329 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:27:22.141673 543705 disk_info.go:125] begin check local disk info of client
I0320 02:27:22.144095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:27:22.144101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a7800 0xc0004a7840]
E0320 02:27:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:23.409775 543705 memory.go:184] no items to output this cycle
I0320 02:27:23.409780 543705 cpu.go:275] no items to output this cycle
E0320 02:27:33.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:33.409889 543705 memory.go:184] no items to output this cycle
I0320 02:27:33.409901 543705 cpu.go:275] no items to output this cycle
I0320 02:27:38.052948 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:27:38.052956 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:27:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:43.410639 543705 memory.go:191] Add success.
I0320 02:27:43.409821 543705 cpu.go:282] Add success.
I0320 02:27:43.420349 543705 net.go:648] Add success.
I0320 02:27:43.423004 543705 net.go:770] primary dev: ETH0
I0320 02:27:43.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:27:43.423029 543705 net.go:698] Add success.
I0320 02:27:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:27:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:27:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:27:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:27:53.409812 543705 memory.go:184] no items to output this cycle
I0320 02:27:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 02:28:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:03.409789 543705 memory.go:184] no items to output this cycle
I0320 02:28:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:28:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:13.409785 543705 memory.go:191] Add success.
I0320 02:28:13.409803 543705 cpu.go:282] Add success.
W0320 02:28:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:28:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:28:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:28:13.420370 543705 net.go:648] Add success.
I0320 02:28:13.423190 543705 net.go:770] primary dev: ETH0
I0320 02:28:13.423202 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:28:13.423214 543705 net.go:698] Add success.
I0320 02:28:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:28:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:28:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 02:28:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:28:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 02:28:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:28:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:28:16.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:28:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:28:16.458089 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:28:16.472438 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:28:22.145678 543705 disk_info.go:125] begin check local disk info of client
I0320 02:28:22.148169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:28:22.148174 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a6300 0xc0004a6340]
E0320 02:28:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:23.409788 543705 memory.go:184] no items to output this cycle
I0320 02:28:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 02:28:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:33.409787 543705 memory.go:184] no items to output this cycle
I0320 02:28:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:28:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:43.409793 543705 memory.go:191] Add success.
I0320 02:28:43.409794 543705 cpu.go:282] Add success.
I0320 02:28:43.420124 543705 net.go:648] Add success.
I0320 02:28:43.422656 543705 net.go:770] primary dev: ETH0
I0320 02:28:43.422672 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:28:43.422686 543705 net.go:698] Add success.
I0320 02:28:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:28:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:28:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:28:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:28:53.409786 543705 memory.go:184] no items to output this cycle
I0320 02:28:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 02:29:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:03.409783 543705 memory.go:184] no items to output this cycle
I0320 02:29:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:29:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:13.409799 543705 memory.go:191] Add success.
I0320 02:29:13.409799 543705 cpu.go:282] Add success.
W0320 02:29:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:29:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:29:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:29:13.420432 543705 net.go:648] Add success.
I0320 02:29:13.423044 543705 net.go:770] primary dev: ETH0
I0320 02:29:13.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:29:13.423067 543705 net.go:698] Add success.
I0320 02:29:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:29:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:29:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 02:29:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:29:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 02:29:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:29:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:29:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:29:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:29:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:29:16.472526 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:29:22.149675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:29:22.152274 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:29:22.152282 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035fbc0 0xc00035fc00]
E0320 02:29:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:23.409758 543705 memory.go:184] no items to output this cycle
I0320 02:29:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 02:29:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:33.409803 543705 memory.go:184] no items to output this cycle
I0320 02:29:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 02:29:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:43.409798 543705 memory.go:191] Add success.
I0320 02:29:43.409801 543705 cpu.go:282] Add success.
I0320 02:29:43.419914 543705 net.go:648] Add success.
I0320 02:29:43.422437 543705 net.go:770] primary dev: ETH0
I0320 02:29:43.422450 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:29:43.422461 543705 net.go:698] Add success.
I0320 02:29:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:29:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:29:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:29:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:29:53.409779 543705 memory.go:184] no items to output this cycle
I0320 02:29:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 02:30:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:03.409766 543705 memory.go:184] no items to output this cycle
I0320 02:30:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 02:30:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:13.409792 543705 memory.go:191] Add success.
I0320 02:30:13.409809 543705 cpu.go:282] Add success.
W0320 02:30:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:30:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:30:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:30:13.420117 543705 net.go:648] Add success.
I0320 02:30:13.422836 543705 net.go:770] primary dev: ETH0
I0320 02:30:13.422854 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:30:13.422870 543705 net.go:698] Add success.
I0320 02:30:13.463968 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"af6ac447-2896-4dc6-a8e7-8b62adb60251","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:30:13.464001 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:30:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:30:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:30:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 02:30:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:30:14.456620 543705 disk_worker.go:494] system disk:vda1
I0320 02:30:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:30:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:30:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:30:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:30:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:30:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:30:22.153674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:30:22.156184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:30:22.156190 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352440 0xc000352480]
E0320 02:30:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:23.409791 543705 memory.go:184] no items to output this cycle
I0320 02:30:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:30:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:33.409812 543705 memory.go:184] no items to output this cycle
I0320 02:30:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 02:30:38.053731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:30:38.053738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:30:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:43.410695 543705 memory.go:191] Add success.
I0320 02:30:43.409783 543705 cpu.go:282] Add success.
I0320 02:30:43.420398 543705 net.go:648] Add success.
I0320 02:30:43.423085 543705 net.go:770] primary dev: ETH0
I0320 02:30:43.423099 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:30:43.423112 543705 net.go:698] Add success.
I0320 02:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:30:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:30:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:30:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:30:53.409782 543705 memory.go:184] no items to output this cycle
I0320 02:30:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 02:31:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:03.409773 543705 memory.go:184] no items to output this cycle
I0320 02:31:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 02:31:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:13.409821 543705 memory.go:191] Add success.
I0320 02:31:13.409826 543705 cpu.go:282] Add success.
W0320 02:31:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:31:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:31:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:31:13.420186 543705 net.go:648] Add success.
I0320 02:31:13.422993 543705 net.go:770] primary dev: ETH0
I0320 02:31:13.423011 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:31:13.423028 543705 net.go:698] Add success.
I0320 02:31:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:31:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:31:14.455412 543705 disk_worker.go:708] disk space is not compliant
W0320 02:31:14.455419 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:31:14.456997 543705 disk_worker.go:494] system disk:vda1
I0320 02:31:14.457038 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:31:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:31:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:31:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:31:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:31:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:31:22.157674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:31:22.160092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:31:22.160097 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be240 0xc0002be280]
E0320 02:31:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:23.409784 543705 memory.go:184] no items to output this cycle
I0320 02:31:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:31:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:33.409801 543705 memory.go:184] no items to output this cycle
I0320 02:31:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 02:31:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:43.409791 543705 memory.go:191] Add success.
I0320 02:31:43.409796 543705 cpu.go:282] Add success.
I0320 02:31:43.419888 543705 net.go:648] Add success.
I0320 02:31:43.422575 543705 net.go:770] primary dev: ETH0
I0320 02:31:43.422590 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:31:43.422604 543705 net.go:698] Add success.
I0320 02:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:31:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:31:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:31:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:31:53.409789 543705 cpu.go:275] no items to output this cycle
I0320 02:31:53.409804 543705 memory.go:184] no items to output this cycle
E0320 02:32:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:03.409803 543705 memory.go:184] no items to output this cycle
I0320 02:32:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 02:32:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:13.409791 543705 memory.go:191] Add success.
I0320 02:32:13.409791 543705 cpu.go:282] Add success.
W0320 02:32:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:32:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:32:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:32:13.420206 543705 net.go:648] Add success.
I0320 02:32:13.422994 543705 net.go:770] primary dev: ETH0
I0320 02:32:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:32:13.423022 543705 net.go:698] Add success.
W0320 02:32:14.455392 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:32:14.455458 543705 disk_worker.go:708] disk space is not compliant
W0320 02:32:14.455461 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:32:14.456000 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:32:14.456007 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:32:14.456012 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:32:14.457873 543705 disk_worker.go:494] system disk:vda1
I0320 02:32:14.457913 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:32:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:32:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:32:16.458028 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:32:16.458036 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:32:16.458083 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:32:16.458101 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:32:16.472456 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:32:22.161671 543705 disk_info.go:125] begin check local disk info of client
I0320 02:32:22.164202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:32:22.164207 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002617c0 0xc000261800]
E0320 02:32:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:23.409775 543705 memory.go:184] no items to output this cycle
I0320 02:32:23.409782 543705 cpu.go:275] no items to output this cycle
E0320 02:32:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:33.409803 543705 memory.go:184] no items to output this cycle
I0320 02:32:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 02:32:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:43.409784 543705 memory.go:191] Add success.
I0320 02:32:43.409787 543705 cpu.go:282] Add success.
I0320 02:32:43.419919 543705 net.go:648] Add success.
I0320 02:32:43.422725 543705 net.go:770] primary dev: ETH0
I0320 02:32:43.422739 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:32:43.422753 543705 net.go:698] Add success.
I0320 02:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:32:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:32:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:32:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:32:53.409784 543705 memory.go:184] no items to output this cycle
I0320 02:32:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 02:33:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:03.409780 543705 memory.go:184] no items to output this cycle
I0320 02:33:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 02:33:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:13.409820 543705 memory.go:191] Add success.
I0320 02:33:13.409828 543705 cpu.go:282] Add success.
W0320 02:33:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:33:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:33:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:33:13.420133 543705 net.go:648] Add success.
I0320 02:33:13.422970 543705 net.go:770] primary dev: ETH0
I0320 02:33:13.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:33:13.422996 543705 net.go:698] Add success.
I0320 02:33:13.469070 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9b9e9d9f-48b8-456e-ba4f-e17d45717e7f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:33:13.469105 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:33:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:33:14.455385 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:33:14.455396 543705 disk_worker.go:708] disk space is not compliant
W0320 02:33:14.455403 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:33:14.457583 543705 disk_worker.go:494] system disk:vda1
I0320 02:33:14.457625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:33:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:33:16.458007 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:33:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:33:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:33:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:33:22.165672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:33:22.168163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:33:22.168169 543705 disk_info.go:196] parse disk info done, disk is : [0xc000518280 0xc0005182c0]
E0320 02:33:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:23.409761 543705 memory.go:184] no items to output this cycle
I0320 02:33:23.409783 543705 cpu.go:275] no items to output this cycle
E0320 02:33:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:33.409782 543705 memory.go:184] no items to output this cycle
I0320 02:33:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 02:33:38.056969 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:33:38.056975 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:33:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:43.410673 543705 memory.go:191] Add success.
I0320 02:33:43.409801 543705 cpu.go:282] Add success.
I0320 02:33:43.420418 543705 net.go:648] Add success.
I0320 02:33:43.423287 543705 net.go:770] primary dev: ETH0
I0320 02:33:43.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:33:43.423312 543705 net.go:698] Add success.
I0320 02:33:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:33:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:33:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:33:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:33:53.409789 543705 memory.go:184] no items to output this cycle
I0320 02:33:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 02:34:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:03.409780 543705 memory.go:184] no items to output this cycle
I0320 02:34:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:34:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:13.409794 543705 cpu.go:282] Add success.
I0320 02:34:13.409795 543705 memory.go:191] Add success.
W0320 02:34:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:34:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:34:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:34:13.420062 543705 net.go:648] Add success.
I0320 02:34:13.422707 543705 net.go:770] primary dev: ETH0
I0320 02:34:13.422725 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:34:13.422740 543705 net.go:698] Add success.
I0320 02:34:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:34:14.455304 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:34:14.455372 543705 disk_worker.go:708] disk space is not compliant
W0320 02:34:14.455376 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:34:14.457553 543705 disk_worker.go:494] system disk:vda1
I0320 02:34:14.457597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:34:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:34:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:34:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:34:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:34:16.472415 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:34:22.169676 543705 disk_info.go:125] begin check local disk info of client
I0320 02:34:22.172194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:34:22.172200 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466100 0xc000466140]
E0320 02:34:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:23.409766 543705 memory.go:184] no items to output this cycle
I0320 02:34:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:34:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:33.409806 543705 memory.go:184] no items to output this cycle
I0320 02:34:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 02:34:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:43.409779 543705 memory.go:191] Add success.
I0320 02:34:43.409800 543705 cpu.go:282] Add success.
I0320 02:34:43.419879 543705 net.go:648] Add success.
I0320 02:34:43.422625 543705 net.go:770] primary dev: ETH0
I0320 02:34:43.422641 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:34:43.422656 543705 net.go:698] Add success.
I0320 02:34:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:34:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:34:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:34:53.410364 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:34:53.410385 543705 memory.go:184] no items to output this cycle
I0320 02:34:53.410398 543705 cpu.go:275] no items to output this cycle
E0320 02:35:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:03.409767 543705 memory.go:184] no items to output this cycle
I0320 02:35:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:35:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:13.409814 543705 memory.go:191] Add success.
I0320 02:35:13.409821 543705 cpu.go:282] Add success.
W0320 02:35:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:35:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:35:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:35:13.420145 543705 net.go:648] Add success.
I0320 02:35:13.423191 543705 net.go:770] primary dev: ETH0
I0320 02:35:13.423206 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:35:13.423220 543705 net.go:698] Add success.
I0320 02:35:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:35:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:35:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 02:35:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:35:14.457573 543705 disk_worker.go:494] system disk:vda1
I0320 02:35:14.457627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:35:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:35:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:35:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:35:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:35:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:35:22.173674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:35:22.176103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:35:22.176109 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a580 0xc00039a5c0]
E0320 02:35:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:23.409783 543705 memory.go:184] no items to output this cycle
I0320 02:35:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:35:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:33.409768 543705 memory.go:184] no items to output this cycle
I0320 02:35:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:35:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:43.409815 543705 memory.go:191] Add success.
I0320 02:35:43.409820 543705 cpu.go:282] Add success.
I0320 02:35:43.419884 543705 net.go:648] Add success.
I0320 02:35:43.422524 543705 net.go:770] primary dev: ETH0
I0320 02:35:43.422537 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:35:43.422548 543705 net.go:698] Add success.
I0320 02:35:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:35:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:35:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:35:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:35:53.409779 543705 memory.go:184] no items to output this cycle
I0320 02:35:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 02:36:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:03.409777 543705 memory.go:184] no items to output this cycle
I0320 02:36:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 02:36:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:13.409785 543705 cpu.go:282] Add success.
I0320 02:36:13.409787 543705 memory.go:191] Add success.
W0320 02:36:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:36:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:36:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:36:13.420276 543705 net.go:648] Add success.
I0320 02:36:13.422918 543705 net.go:770] primary dev: ETH0
I0320 02:36:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:36:13.422949 543705 net.go:698] Add success.
I0320 02:36:13.466321 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"407a10ad-52c9-448f-a955-5c393a92f08a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:36:13.466354 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:36:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:36:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:36:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 02:36:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:36:14.457242 543705 disk_worker.go:494] system disk:vda1
I0320 02:36:14.457279 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:36:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:36:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:36:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:36:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:36:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:36:22.177677 543705 disk_info.go:125] begin check local disk info of client
I0320 02:36:22.180212 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:36:22.180218 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390300 0xc000390340]
E0320 02:36:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:23.409785 543705 memory.go:184] no items to output this cycle
I0320 02:36:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:36:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:33.409803 543705 memory.go:184] no items to output this cycle
I0320 02:36:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 02:36:38.057744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:36:38.057750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:36:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:43.410690 543705 memory.go:191] Add success.
I0320 02:36:43.409799 543705 cpu.go:282] Add success.
I0320 02:36:43.420435 543705 net.go:648] Add success.
I0320 02:36:43.423159 543705 net.go:770] primary dev: ETH0
I0320 02:36:43.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:36:43.423184 543705 net.go:698] Add success.
I0320 02:36:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:36:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:36:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:36:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:36:53.409807 543705 memory.go:184] no items to output this cycle
I0320 02:36:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 02:37:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:03.409772 543705 memory.go:184] no items to output this cycle
I0320 02:37:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 02:37:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:13.409803 543705 memory.go:191] Add success.
I0320 02:37:13.409821 543705 cpu.go:282] Add success.
W0320 02:37:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:37:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:37:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:37:13.420147 543705 net.go:648] Add success.
I0320 02:37:13.423219 543705 net.go:770] primary dev: ETH0
I0320 02:37:13.423234 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:37:13.423251 543705 net.go:698] Add success.
I0320 02:37:13.452781 543705 event_worker.go:152] Polling the log file for events...
W0320 02:37:14.454316 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:37:14.454331 543705 disk_worker.go:708] disk space is not compliant
W0320 02:37:14.454335 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:37:14.454971 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:37:14.454981 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:37:14.454989 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:37:14.456098 543705 disk_worker.go:494] system disk:vda1
I0320 02:37:14.456127 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:37:15.456840 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:37:15.456849 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:37:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:37:16.457984 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:37:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:37:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:37:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:37:22.181677 543705 disk_info.go:125] begin check local disk info of client
I0320 02:37:22.184077 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:37:22.184083 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c41c0 0xc0000c4200]
E0320 02:37:23.410253 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:23.410270 543705 memory.go:184] no items to output this cycle
I0320 02:37:23.410285 543705 cpu.go:275] no items to output this cycle
E0320 02:37:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:33.409783 543705 memory.go:184] no items to output this cycle
I0320 02:37:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:37:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:43.409820 543705 memory.go:191] Add success.
I0320 02:37:43.409824 543705 cpu.go:282] Add success.
I0320 02:37:43.419943 543705 net.go:648] Add success.
I0320 02:37:43.422911 543705 net.go:770] primary dev: ETH0
I0320 02:37:43.422924 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:37:43.422936 543705 net.go:698] Add success.
I0320 02:37:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:37:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:37:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:37:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:37:53.409795 543705 memory.go:184] no items to output this cycle
I0320 02:37:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:38:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:03.409780 543705 memory.go:184] no items to output this cycle
I0320 02:38:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:38:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:13.409800 543705 memory.go:191] Add success.
I0320 02:38:13.409801 543705 cpu.go:282] Add success.
W0320 02:38:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:38:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:38:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:38:13.420118 543705 net.go:648] Add success.
I0320 02:38:13.422875 543705 net.go:770] primary dev: ETH0
I0320 02:38:13.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:38:13.422901 543705 net.go:698] Add success.
I0320 02:38:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:38:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:38:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 02:38:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:38:14.458958 543705 disk_worker.go:494] system disk:vda1
I0320 02:38:14.458986 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:38:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:38:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:38:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:38:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:38:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:38:22.185675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:38:22.188156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:38:22.188162 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e3c0 0xc00039e400]
E0320 02:38:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:23.409779 543705 memory.go:184] no items to output this cycle
I0320 02:38:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 02:38:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:33.409787 543705 memory.go:184] no items to output this cycle
I0320 02:38:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 02:38:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:43.409800 543705 memory.go:191] Add success.
I0320 02:38:43.409805 543705 cpu.go:282] Add success.
I0320 02:38:43.419954 543705 net.go:648] Add success.
I0320 02:38:43.422932 543705 net.go:770] primary dev: ETH0
I0320 02:38:43.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:38:43.422965 543705 net.go:698] Add success.
I0320 02:38:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:38:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:38:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:38:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:38:53.409816 543705 memory.go:184] no items to output this cycle
I0320 02:38:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 02:39:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:03.409792 543705 memory.go:184] no items to output this cycle
I0320 02:39:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:39:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:13.409838 543705 memory.go:191] Add success.
I0320 02:39:13.409842 543705 cpu.go:282] Add success.
W0320 02:39:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:39:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:39:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:39:13.420171 543705 net.go:648] Add success.
I0320 02:39:13.423002 543705 net.go:770] primary dev: ETH0
I0320 02:39:13.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:39:13.423029 543705 net.go:698] Add success.
I0320 02:39:13.497584 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"602a7ab7-8bfc-43df-a125-1dd75f5a6f09","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:39:13.497616 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:39:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:39:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:39:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 02:39:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:39:14.456854 543705 disk_worker.go:494] system disk:vda1
I0320 02:39:14.456885 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:39:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:39:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:39:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:39:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:39:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:39:22.189679 543705 disk_info.go:125] begin check local disk info of client
I0320 02:39:22.192080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:39:22.192087 543705 disk_info.go:196] parse disk info done, disk is : [0xc000562300 0xc000562340]
E0320 02:39:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:23.409767 543705 memory.go:184] no items to output this cycle
I0320 02:39:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:39:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:33.409787 543705 memory.go:184] no items to output this cycle
I0320 02:39:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 02:39:38.061004 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:39:38.061011 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:39:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:43.410717 543705 memory.go:191] Add success.
I0320 02:39:43.409809 543705 cpu.go:282] Add success.
I0320 02:39:43.420445 543705 net.go:648] Add success.
I0320 02:39:43.423307 543705 net.go:770] primary dev: ETH0
I0320 02:39:43.423322 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:39:43.423337 543705 net.go:698] Add success.
I0320 02:39:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:39:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:39:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:39:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:39:53.409801 543705 memory.go:184] no items to output this cycle
I0320 02:39:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:40:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:03.409776 543705 memory.go:184] no items to output this cycle
I0320 02:40:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:40:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:13.409809 543705 memory.go:191] Add success.
I0320 02:40:13.409810 543705 cpu.go:282] Add success.
W0320 02:40:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:40:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:40:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:40:13.420308 543705 net.go:648] Add success.
I0320 02:40:13.422991 543705 net.go:770] primary dev: ETH0
I0320 02:40:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:40:13.423015 543705 net.go:698] Add success.
I0320 02:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:40:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:40:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 02:40:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:40:14.456499 543705 disk_worker.go:494] system disk:vda1
I0320 02:40:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:40:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:40:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:40:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:40:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:40:16.472454 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:40:22.193679 543705 disk_info.go:125] begin check local disk info of client
I0320 02:40:22.196126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:40:22.196133 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f2940 0xc0003f2980]
E0320 02:40:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:23.409767 543705 memory.go:184] no items to output this cycle
I0320 02:40:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 02:40:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:33.409799 543705 memory.go:184] no items to output this cycle
I0320 02:40:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 02:40:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:43.409781 543705 memory.go:191] Add success.
I0320 02:40:43.409799 543705 cpu.go:282] Add success.
I0320 02:40:43.419993 543705 net.go:648] Add success.
I0320 02:40:43.423040 543705 net.go:770] primary dev: ETH0
I0320 02:40:43.423057 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:40:43.423071 543705 net.go:698] Add success.
I0320 02:40:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:40:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:40:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:40:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:40:53.409778 543705 memory.go:184] no items to output this cycle
I0320 02:40:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 02:41:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:03.409804 543705 memory.go:184] no items to output this cycle
I0320 02:41:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 02:41:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:13.409786 543705 memory.go:191] Add success.
W0320 02:41:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 02:41:13.409818 543705 cpu.go:282] Add success.
W0320 02:41:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:41:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:41:13.420189 543705 net.go:648] Add success.
I0320 02:41:13.422830 543705 net.go:770] primary dev: ETH0
I0320 02:41:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:41:13.422868 543705 net.go:698] Add success.
I0320 02:41:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:41:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:41:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 02:41:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:41:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 02:41:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:41:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:41:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:41:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:41:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:41:22.197672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:41:22.200136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:41:22.200143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba7c0 0xc0002ba800]
E0320 02:41:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:23.409796 543705 memory.go:184] no items to output this cycle
I0320 02:41:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 02:41:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:33.409769 543705 memory.go:184] no items to output this cycle
I0320 02:41:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 02:41:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:43.409793 543705 memory.go:191] Add success.
I0320 02:41:43.409793 543705 cpu.go:282] Add success.
I0320 02:41:43.419902 543705 net.go:648] Add success.
I0320 02:41:43.422417 543705 net.go:770] primary dev: ETH0
I0320 02:41:43.422436 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:41:43.422452 543705 net.go:698] Add success.
I0320 02:41:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:41:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:41:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:41:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:41:53.409787 543705 memory.go:184] no items to output this cycle
I0320 02:41:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:42:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:03.409766 543705 memory.go:184] no items to output this cycle
I0320 02:42:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 02:42:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:13.409786 543705 memory.go:191] Add success.
I0320 02:42:13.409789 543705 cpu.go:282] Add success.
W0320 02:42:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:42:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:42:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:42:13.420331 543705 net.go:648] Add success.
I0320 02:42:13.422817 543705 net.go:770] primary dev: ETH0
I0320 02:42:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:42:13.422845 543705 net.go:698] Add success.
I0320 02:42:13.468775 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e06dfcd9-4903-4054-9919-143f384cdf59","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:42:13.468808 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 02:42:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:42:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 02:42:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:42:14.456103 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:42:14.456112 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:42:14.456118 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:42:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 02:42:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:42:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:42:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 02:42:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:42:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:42:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:42:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:42:16.472123 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:42:22.201681 543705 disk_info.go:125] begin check local disk info of client
I0320 02:42:22.204240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:42:22.204248 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ec640 0xc0000ec680]
E0320 02:42:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:23.409766 543705 memory.go:184] no items to output this cycle
I0320 02:42:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:42:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:33.409788 543705 memory.go:184] no items to output this cycle
I0320 02:42:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 02:42:38.061730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:42:38.061736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:42:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:43.410655 543705 memory.go:191] Add success.
I0320 02:42:43.409813 543705 cpu.go:282] Add success.
I0320 02:42:43.420447 543705 net.go:648] Add success.
I0320 02:42:43.423084 543705 net.go:770] primary dev: ETH0
I0320 02:42:43.423099 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:42:43.423113 543705 net.go:698] Add success.
I0320 02:42:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:42:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:42:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:42:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:42:53.409790 543705 cpu.go:275] no items to output this cycle
I0320 02:42:53.409793 543705 memory.go:184] no items to output this cycle
E0320 02:43:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:03.409797 543705 memory.go:184] no items to output this cycle
I0320 02:43:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:43:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:13.409802 543705 memory.go:191] Add success.
I0320 02:43:13.409803 543705 cpu.go:282] Add success.
W0320 02:43:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:43:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:43:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:43:13.420189 543705 net.go:648] Add success.
I0320 02:43:13.422776 543705 net.go:770] primary dev: ETH0
I0320 02:43:13.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:43:13.422802 543705 net.go:698] Add success.
I0320 02:43:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:43:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:43:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 02:43:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:43:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 02:43:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:43:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:43:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:43:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:43:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:43:16.472419 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:43:22.205672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:43:22.208144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:43:22.208151 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f600 0xc00029f640]
E0320 02:43:23.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:23.409894 543705 memory.go:184] no items to output this cycle
I0320 02:43:23.409971 543705 cpu.go:275] no items to output this cycle
E0320 02:43:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:33.409798 543705 memory.go:184] no items to output this cycle
I0320 02:43:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 02:43:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:43.409782 543705 memory.go:191] Add success.
I0320 02:43:43.409802 543705 cpu.go:282] Add success.
I0320 02:43:43.419944 543705 net.go:648] Add success.
I0320 02:43:43.422707 543705 net.go:770] primary dev: ETH0
I0320 02:43:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:43:43.422732 543705 net.go:698] Add success.
I0320 02:43:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:43:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:43:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:43:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:43:53.409777 543705 memory.go:184] no items to output this cycle
I0320 02:43:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:44:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:03.409798 543705 memory.go:184] no items to output this cycle
I0320 02:44:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 02:44:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:13.409790 543705 memory.go:191] Add success.
I0320 02:44:13.409793 543705 cpu.go:282] Add success.
W0320 02:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:44:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:44:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:44:13.420058 543705 net.go:648] Add success.
I0320 02:44:13.422854 543705 net.go:770] primary dev: ETH0
I0320 02:44:13.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:44:13.422879 543705 net.go:698] Add success.
I0320 02:44:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:44:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:44:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 02:44:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:44:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 02:44:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:44:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:44:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:44:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:44:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:44:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:44:22.209680 543705 disk_info.go:125] begin check local disk info of client
I0320 02:44:22.212234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:44:22.212240 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a740 0xc00007a780]
E0320 02:44:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:23.409767 543705 memory.go:184] no items to output this cycle
I0320 02:44:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:44:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:33.409776 543705 memory.go:184] no items to output this cycle
I0320 02:44:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:44:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:43.409819 543705 memory.go:191] Add success.
I0320 02:44:43.409821 543705 cpu.go:282] Add success.
I0320 02:44:43.419707 543705 net.go:770] primary dev: ETH0
I0320 02:44:43.419721 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:44:43.419735 543705 net.go:698] Add success.
I0320 02:44:43.420097 543705 net.go:648] Add success.
I0320 02:44:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:44:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:44:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:44:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:44:53.409810 543705 memory.go:184] no items to output this cycle
I0320 02:44:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 02:45:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:03.409774 543705 memory.go:184] no items to output this cycle
I0320 02:45:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:45:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:13.409798 543705 memory.go:191] Add success.
I0320 02:45:13.409801 543705 cpu.go:282] Add success.
W0320 02:45:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:45:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:45:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:45:13.420064 543705 net.go:648] Add success.
I0320 02:45:13.423122 543705 net.go:770] primary dev: ETH0
I0320 02:45:13.423137 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:45:13.423151 543705 net.go:698] Add success.
I0320 02:45:13.469567 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d605e169-59df-4d9c-90ce-3e64f26063f5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:45:13.469599 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:45:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:45:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:45:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 02:45:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:45:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 02:45:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:45:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:45:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:45:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:45:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:45:22.213674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:45:22.216189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:45:22.216196 543705 disk_info.go:196] parse disk info done, disk is : [0xc000326fc0 0xc000327000]
E0320 02:45:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:23.409762 543705 memory.go:184] no items to output this cycle
I0320 02:45:23.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:45:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:33.409795 543705 memory.go:184] no items to output this cycle
I0320 02:45:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 02:45:38.065017 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:45:38.065023 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:45:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:43.410605 543705 memory.go:191] Add success.
I0320 02:45:43.409799 543705 cpu.go:282] Add success.
I0320 02:45:43.420318 543705 net.go:648] Add success.
I0320 02:45:43.423042 543705 net.go:770] primary dev: ETH0
I0320 02:45:43.423055 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:45:43.423067 543705 net.go:698] Add success.
I0320 02:45:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:45:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:45:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:45:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:45:53.409788 543705 memory.go:184] no items to output this cycle
I0320 02:45:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 02:46:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:03.409773 543705 memory.go:184] no items to output this cycle
I0320 02:46:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:46:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:13.409784 543705 memory.go:191] Add success.
I0320 02:46:13.409808 543705 cpu.go:282] Add success.
W0320 02:46:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:46:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:46:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:46:13.420177 543705 net.go:648] Add success.
I0320 02:46:13.422759 543705 net.go:770] primary dev: ETH0
I0320 02:46:13.422771 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:46:13.422782 543705 net.go:698] Add success.
I0320 02:46:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:46:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:46:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 02:46:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:46:14.459182 543705 disk_worker.go:494] system disk:vda1
I0320 02:46:14.459211 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:46:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:46:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:46:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:46:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:46:16.472431 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:46:22.217672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:46:22.220157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:46:22.220164 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039ed80 0xc00039edc0]
E0320 02:46:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:23.409767 543705 memory.go:184] no items to output this cycle
I0320 02:46:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 02:46:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:33.409797 543705 memory.go:184] no items to output this cycle
I0320 02:46:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 02:46:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:43.409784 543705 memory.go:191] Add success.
I0320 02:46:43.409800 543705 cpu.go:282] Add success.
I0320 02:46:43.419914 543705 net.go:648] Add success.
I0320 02:46:43.422652 543705 net.go:770] primary dev: ETH0
I0320 02:46:43.422670 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:46:43.422685 543705 net.go:698] Add success.
I0320 02:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:46:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:46:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:46:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:46:53.409812 543705 memory.go:184] no items to output this cycle
I0320 02:46:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 02:47:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:03.409796 543705 memory.go:184] no items to output this cycle
I0320 02:47:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 02:47:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:13.409821 543705 memory.go:191] Add success.
I0320 02:47:13.409826 543705 cpu.go:282] Add success.
W0320 02:47:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:47:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:47:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:47:13.420226 543705 net.go:648] Add success.
I0320 02:47:13.422888 543705 net.go:770] primary dev: ETH0
I0320 02:47:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:47:13.422917 543705 net.go:698] Add success.
I0320 02:47:13.453458 543705 event_worker.go:152] Polling the log file for events...
W0320 02:47:14.454674 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:47:14.454689 543705 disk_worker.go:708] disk space is not compliant
W0320 02:47:14.454694 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:47:14.456902 543705 disk_worker.go:494] system disk:vda1
I0320 02:47:14.456956 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:47:14.457733 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:47:14.457754 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:47:14.457760 543705 custom_config.go:64] query custom config with name: gpu
E0320 02:47:15.456888 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:47:15.456898 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:47:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:47:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:47:16.458002 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:47:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:47:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:47:22.221692 543705 disk_info.go:125] begin check local disk info of client
I0320 02:47:22.224113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:47:22.224119 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047a7c0 0xc00047a800]
E0320 02:47:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:23.409792 543705 memory.go:184] no items to output this cycle
I0320 02:47:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 02:47:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 02:47:33.409794 543705 memory.go:184] no items to output this cycle
E0320 02:47:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:43.409808 543705 memory.go:191] Add success.
I0320 02:47:43.409823 543705 cpu.go:282] Add success.
I0320 02:47:43.419997 543705 net.go:648] Add success.
I0320 02:47:43.422514 543705 net.go:770] primary dev: ETH0
I0320 02:47:43.422527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:47:43.422539 543705 net.go:698] Add success.
I0320 02:47:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:47:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:47:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:47:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:47:53.409776 543705 memory.go:184] no items to output this cycle
I0320 02:47:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:48:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:03.409772 543705 memory.go:184] no items to output this cycle
I0320 02:48:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 02:48:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:13.409799 543705 cpu.go:282] Add success.
I0320 02:48:13.409822 543705 memory.go:191] Add success.
W0320 02:48:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:48:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:48:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:48:13.420252 543705 net.go:648] Add success.
I0320 02:48:13.421198 543705 net.go:770] primary dev: ETH0
I0320 02:48:13.421211 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:48:13.421224 543705 net.go:698] Add success.
I0320 02:48:13.559394 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5672636e-de0f-4379-8875-ebe0347a47e2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:48:13.559435 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:48:14.453976 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:48:14.454174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:48:14.454346 543705 disk_worker.go:708] disk space is not compliant
W0320 02:48:14.454351 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:48:14.456206 543705 disk_worker.go:494] system disk:vda1
I0320 02:48:14.456237 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:48:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:48:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:48:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:48:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:48:22.225681 543705 disk_info.go:125] begin check local disk info of client
I0320 02:48:22.228158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:48:22.228165 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304080 0xc0003040c0]
E0320 02:48:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:23.409793 543705 memory.go:184] no items to output this cycle
I0320 02:48:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 02:48:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 02:48:33.409802 543705 memory.go:184] no items to output this cycle
I0320 02:48:38.065737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:48:38.065743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:48:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:43.410599 543705 memory.go:191] Add success.
I0320 02:48:43.409821 543705 cpu.go:282] Add success.
I0320 02:48:43.420313 543705 net.go:648] Add success.
I0320 02:48:43.422901 543705 net.go:770] primary dev: ETH0
I0320 02:48:43.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:48:43.422927 543705 net.go:698] Add success.
I0320 02:48:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:48:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:48:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:48:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:48:53.409782 543705 memory.go:184] no items to output this cycle
I0320 02:48:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:49:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:03.409788 543705 memory.go:184] no items to output this cycle
I0320 02:49:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:49:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:13.409807 543705 memory.go:191] Add success.
I0320 02:49:13.409820 543705 cpu.go:282] Add success.
W0320 02:49:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:49:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:49:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:49:13.420175 543705 net.go:648] Add success.
I0320 02:49:13.423148 543705 net.go:770] primary dev: ETH0
I0320 02:49:13.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:49:13.423176 543705 net.go:698] Add success.
I0320 02:49:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:49:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:49:14.455320 543705 disk_worker.go:708] disk space is not compliant
W0320 02:49:14.455325 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:49:14.457036 543705 disk_worker.go:494] system disk:vda1
I0320 02:49:14.457065 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:49:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:49:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:49:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:49:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:49:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:49:22.229674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:49:22.232158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:49:22.232165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003973c0 0xc000397400]
E0320 02:49:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:23.409778 543705 memory.go:184] no items to output this cycle
I0320 02:49:23.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:49:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:33.409783 543705 memory.go:184] no items to output this cycle
I0320 02:49:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:49:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:43.409791 543705 memory.go:191] Add success.
I0320 02:49:43.409814 543705 cpu.go:282] Add success.
I0320 02:49:43.419870 543705 net.go:648] Add success.
I0320 02:49:43.422313 543705 net.go:770] primary dev: ETH0
I0320 02:49:43.422327 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:49:43.422339 543705 net.go:698] Add success.
I0320 02:49:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:49:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:49:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:49:53.410247 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:49:53.410270 543705 memory.go:184] no items to output this cycle
I0320 02:49:53.410271 543705 cpu.go:275] no items to output this cycle
E0320 02:50:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:03.409806 543705 memory.go:184] no items to output this cycle
I0320 02:50:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 02:50:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:13.409788 543705 memory.go:191] Add success.
W0320 02:50:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 02:50:13.409819 543705 cpu.go:282] Add success.
W0320 02:50:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:50:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:50:13.420134 543705 net.go:648] Add success.
I0320 02:50:13.422894 543705 net.go:770] primary dev: ETH0
I0320 02:50:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:50:13.422924 543705 net.go:698] Add success.
I0320 02:50:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:50:14.455308 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:50:14.455414 543705 disk_worker.go:708] disk space is not compliant
W0320 02:50:14.455419 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:50:14.456993 543705 disk_worker.go:494] system disk:vda1
I0320 02:50:14.457021 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:50:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:50:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:50:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:50:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:50:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:50:22.233678 543705 disk_info.go:125] begin check local disk info of client
I0320 02:50:22.236224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:50:22.236231 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384080 0xc0003840c0]
E0320 02:50:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:23.409781 543705 memory.go:184] no items to output this cycle
I0320 02:50:23.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:50:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:33.409782 543705 memory.go:184] no items to output this cycle
I0320 02:50:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:50:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:43.409797 543705 memory.go:191] Add success.
I0320 02:50:43.409807 543705 cpu.go:282] Add success.
I0320 02:50:43.419901 543705 net.go:648] Add success.
I0320 02:50:43.422693 543705 net.go:770] primary dev: ETH0
I0320 02:50:43.422708 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:50:43.422723 543705 net.go:698] Add success.
I0320 02:50:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:50:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:50:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:50:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:50:53.409786 543705 memory.go:184] no items to output this cycle
I0320 02:50:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 02:51:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:03.409787 543705 memory.go:184] no items to output this cycle
I0320 02:51:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:51:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:13.409809 543705 memory.go:191] Add success.
I0320 02:51:13.409809 543705 cpu.go:282] Add success.
W0320 02:51:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:51:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:51:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:51:13.420154 543705 net.go:648] Add success.
I0320 02:51:13.423098 543705 net.go:770] primary dev: ETH0
I0320 02:51:13.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:51:13.423124 543705 net.go:698] Add success.
I0320 02:51:13.469354 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1576fa2b-66e7-4cc0-8c46-cbb674bd3cb1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:51:13.469387 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:51:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:51:14.455457 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:51:14.455478 543705 disk_worker.go:708] disk space is not compliant
W0320 02:51:14.455482 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:51:14.457537 543705 disk_worker.go:494] system disk:vda1
I0320 02:51:14.457563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:51:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:51:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:51:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:51:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:51:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:51:22.237677 543705 disk_info.go:125] begin check local disk info of client
I0320 02:51:22.240114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:51:22.240120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f7e40 0xc0004f7e80]
E0320 02:51:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:23.409782 543705 memory.go:184] no items to output this cycle
I0320 02:51:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:51:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:33.409766 543705 memory.go:184] no items to output this cycle
I0320 02:51:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 02:51:38.069032 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:51:38.069038 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:51:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:43.410648 543705 memory.go:191] Add success.
I0320 02:51:43.409834 543705 cpu.go:282] Add success.
I0320 02:51:43.420289 543705 net.go:648] Add success.
I0320 02:51:43.423115 543705 net.go:770] primary dev: ETH0
I0320 02:51:43.423128 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:51:43.423141 543705 net.go:698] Add success.
I0320 02:51:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:51:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:51:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:51:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:51:53.409777 543705 memory.go:184] no items to output this cycle
I0320 02:51:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 02:52:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:03.409783 543705 memory.go:184] no items to output this cycle
I0320 02:52:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 02:52:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:13.409792 543705 memory.go:191] Add success.
I0320 02:52:13.409796 543705 cpu.go:282] Add success.
W0320 02:52:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:52:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:52:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:52:13.420132 543705 net.go:648] Add success.
I0320 02:52:13.422974 543705 net.go:770] primary dev: ETH0
I0320 02:52:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:52:13.423005 543705 net.go:698] Add success.
W0320 02:52:14.455324 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:52:14.455339 543705 disk_worker.go:708] disk space is not compliant
W0320 02:52:14.455342 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:52:14.456503 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:52:14.456512 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:52:14.456518 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:52:14.457107 543705 disk_worker.go:494] system disk:vda1
I0320 02:52:14.457140 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:52:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:52:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:52:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:52:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:52:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:52:16.457999 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:52:16.472330 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:52:22.241674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:52:22.244156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:52:22.244163 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053c640 0xc00053c680]
E0320 02:52:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:23.409788 543705 memory.go:184] no items to output this cycle
I0320 02:52:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 02:52:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:33.409774 543705 memory.go:184] no items to output this cycle
I0320 02:52:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 02:52:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:43.409799 543705 memory.go:191] Add success.
I0320 02:52:43.409800 543705 cpu.go:282] Add success.
I0320 02:52:43.419922 543705 net.go:648] Add success.
I0320 02:52:43.422791 543705 net.go:770] primary dev: ETH0
I0320 02:52:43.422807 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:52:43.422821 543705 net.go:698] Add success.
I0320 02:52:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:52:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:52:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:52:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:52:53.409775 543705 memory.go:184] no items to output this cycle
I0320 02:52:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 02:53:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:03.409800 543705 memory.go:184] no items to output this cycle
I0320 02:53:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 02:53:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:13.409809 543705 memory.go:191] Add success.
I0320 02:53:13.409811 543705 cpu.go:282] Add success.
W0320 02:53:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:53:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:53:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:53:13.420150 543705 net.go:648] Add success.
I0320 02:53:13.422811 543705 net.go:770] primary dev: ETH0
I0320 02:53:13.422825 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:53:13.422838 543705 net.go:698] Add success.
I0320 02:53:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:53:14.455356 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:53:14.455369 543705 disk_worker.go:708] disk space is not compliant
W0320 02:53:14.455373 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:53:14.457006 543705 disk_worker.go:494] system disk:vda1
I0320 02:53:14.457035 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:53:15.455948 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:53:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:53:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:53:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:53:22.245675 543705 disk_info.go:125] begin check local disk info of client
I0320 02:53:22.248107 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:53:22.248113 543705 disk_info.go:196] parse disk info done, disk is : [0xc000467480 0xc0004674c0]
E0320 02:53:23.410697 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:23.410710 543705 memory.go:184] no items to output this cycle
I0320 02:53:23.410751 543705 cpu.go:275] no items to output this cycle
E0320 02:53:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:33.409770 543705 memory.go:184] no items to output this cycle
I0320 02:53:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:53:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:43.409816 543705 memory.go:191] Add success.
I0320 02:53:43.409824 543705 cpu.go:282] Add success.
I0320 02:53:43.420018 543705 net.go:648] Add success.
I0320 02:53:43.423003 543705 net.go:770] primary dev: ETH0
I0320 02:53:43.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:53:43.423030 543705 net.go:698] Add success.
I0320 02:53:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:53:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:53:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:53:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:53:53.409779 543705 memory.go:184] no items to output this cycle
I0320 02:53:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 02:54:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:03.409776 543705 memory.go:184] no items to output this cycle
I0320 02:54:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 02:54:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:13.409810 543705 memory.go:191] Add success.
I0320 02:54:13.409809 543705 cpu.go:282] Add success.
W0320 02:54:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:54:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:54:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:54:13.420341 543705 net.go:648] Add success.
I0320 02:54:13.423071 543705 net.go:770] primary dev: ETH0
I0320 02:54:13.423087 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:54:13.423101 543705 net.go:698] Add success.
I0320 02:54:13.464200 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"03aa716d-a898-45f0-94f8-32e8d4c007b4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:54:13.464232 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 02:54:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:54:14.455350 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:54:14.455364 543705 disk_worker.go:708] disk space is not compliant
W0320 02:54:14.455367 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:54:14.457533 543705 disk_worker.go:494] system disk:vda1
I0320 02:54:14.457576 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:54:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:54:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:54:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:54:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:54:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:54:22.249674 543705 disk_info.go:125] begin check local disk info of client
I0320 02:54:22.252171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:54:22.252177 543705 disk_info.go:196] parse disk info done, disk is : [0xc000538600 0xc000538640]
E0320 02:54:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:23.409756 543705 memory.go:184] no items to output this cycle
I0320 02:54:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:54:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:33.409801 543705 memory.go:184] no items to output this cycle
I0320 02:54:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 02:54:38.069729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:54:38.069735 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:54:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:43.410600 543705 memory.go:191] Add success.
I0320 02:54:43.409804 543705 cpu.go:282] Add success.
I0320 02:54:43.420362 543705 net.go:648] Add success.
I0320 02:54:43.422984 543705 net.go:770] primary dev: ETH0
I0320 02:54:43.422998 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:54:43.423011 543705 net.go:698] Add success.
I0320 02:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:54:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:54:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:54:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:54:53.409776 543705 memory.go:184] no items to output this cycle
I0320 02:54:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 02:55:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:03.409768 543705 memory.go:184] no items to output this cycle
I0320 02:55:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 02:55:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:13.409807 543705 memory.go:191] Add success.
I0320 02:55:13.409808 543705 cpu.go:282] Add success.
W0320 02:55:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:55:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:55:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:55:13.420178 543705 net.go:648] Add success.
I0320 02:55:13.422879 543705 net.go:770] primary dev: ETH0
I0320 02:55:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:55:13.422905 543705 net.go:698] Add success.
I0320 02:55:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:55:14.455328 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:55:14.455427 543705 disk_worker.go:708] disk space is not compliant
W0320 02:55:14.455431 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:55:14.457546 543705 disk_worker.go:494] system disk:vda1
I0320 02:55:14.457586 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:55:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:55:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:55:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:55:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:55:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:55:22.253679 543705 disk_info.go:125] begin check local disk info of client
I0320 02:55:22.256089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:55:22.256095 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba380 0xc0003ba3c0]
E0320 02:55:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:23.409783 543705 memory.go:184] no items to output this cycle
I0320 02:55:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 02:55:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:33.409769 543705 memory.go:184] no items to output this cycle
I0320 02:55:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 02:55:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:43.409792 543705 memory.go:191] Add success.
I0320 02:55:43.409793 543705 cpu.go:282] Add success.
I0320 02:55:43.420012 543705 net.go:648] Add success.
I0320 02:55:43.422906 543705 net.go:770] primary dev: ETH0
I0320 02:55:43.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:55:43.422934 543705 net.go:698] Add success.
I0320 02:55:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:55:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:55:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:55:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:55:53.409808 543705 memory.go:184] no items to output this cycle
I0320 02:55:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 02:56:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:03.409776 543705 memory.go:184] no items to output this cycle
I0320 02:56:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 02:56:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:13.409799 543705 memory.go:191] Add success.
I0320 02:56:13.409799 543705 cpu.go:282] Add success.
W0320 02:56:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:56:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:56:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:56:13.420170 543705 net.go:648] Add success.
I0320 02:56:13.423033 543705 net.go:770] primary dev: ETH0
I0320 02:56:13.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:56:13.423058 543705 net.go:698] Add success.
I0320 02:56:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:56:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:56:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 02:56:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:56:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 02:56:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:56:15.456017 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:56:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:56:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:56:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:56:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:56:22.257678 543705 disk_info.go:125] begin check local disk info of client
I0320 02:56:22.260145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:56:22.260151 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b4300 0xc0004b4340]
E0320 02:56:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:23.409790 543705 memory.go:184] no items to output this cycle
I0320 02:56:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:56:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:33.409774 543705 memory.go:184] no items to output this cycle
I0320 02:56:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 02:56:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:43.409773 543705 memory.go:191] Add success.
I0320 02:56:43.409807 543705 cpu.go:282] Add success.
I0320 02:56:43.419996 543705 net.go:648] Add success.
I0320 02:56:43.422610 543705 net.go:770] primary dev: ETH0
I0320 02:56:43.422623 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:56:43.422635 543705 net.go:698] Add success.
I0320 02:56:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:56:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:56:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:56:53.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:56:53.409821 543705 memory.go:184] no items to output this cycle
I0320 02:56:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 02:57:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:03.409785 543705 memory.go:184] no items to output this cycle
I0320 02:57:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 02:57:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:13.409811 543705 memory.go:191] Add success.
I0320 02:57:13.409820 543705 cpu.go:282] Add success.
W0320 02:57:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:57:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:57:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:57:13.420284 543705 net.go:648] Add success.
I0320 02:57:13.429768 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 02:57:13.429843 543705 net.go:770] primary dev: ETH0
I0320 02:57:13.429856 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:57:13.429868 543705 net.go:698] Add success.
I0320 02:57:13.453425 543705 event_worker.go:152] Polling the log file for events...
I0320 02:57:13.464157 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5459d80f-1332-48b2-9f8d-d78713da2b5f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 02:57:13.464192 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 02:57:14.455379 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:57:14.455394 543705 disk_worker.go:708] disk space is not compliant
W0320 02:57:14.455399 543705 disk_worker.go:728] disk inode is not compliant
E0320 02:57:14.457724 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 02:57:14.457731 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 02:57:14.457735 543705 custom_config.go:64] query custom config with name: gpu
I0320 02:57:14.457763 543705 disk_worker.go:494] system disk:vda1
I0320 02:57:14.457798 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 02:57:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 02:57:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:57:16.457936 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 02:57:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 02:57:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:57:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:57:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:57:22.261672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:57:22.264092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:57:22.264098 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344140 0xc000344180]
E0320 02:57:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:23.409786 543705 memory.go:184] no items to output this cycle
I0320 02:57:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:57:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:33.409780 543705 memory.go:184] no items to output this cycle
I0320 02:57:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 02:57:38.073045 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 02:57:38.073052 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 02:57:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:43.410760 543705 memory.go:191] Add success.
I0320 02:57:43.409811 543705 cpu.go:282] Add success.
I0320 02:57:43.420489 543705 net.go:648] Add success.
I0320 02:57:43.423305 543705 net.go:770] primary dev: ETH0
I0320 02:57:43.423318 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:57:43.423331 543705 net.go:698] Add success.
I0320 02:57:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:57:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:57:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:57:53.409788 543705 cpu.go:275] no items to output this cycle
I0320 02:57:53.409793 543705 memory.go:184] no items to output this cycle
E0320 02:58:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 02:58:03.409791 543705 memory.go:184] no items to output this cycle
E0320 02:58:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:13.409829 543705 memory.go:191] Add success.
I0320 02:58:13.409831 543705 cpu.go:282] Add success.
W0320 02:58:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:58:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:58:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:58:13.420278 543705 net.go:648] Add success.
I0320 02:58:13.422824 543705 net.go:770] primary dev: ETH0
I0320 02:58:13.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:58:13.422850 543705 net.go:698] Add success.
I0320 02:58:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:58:14.455428 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:58:14.455441 543705 disk_worker.go:708] disk space is not compliant
W0320 02:58:14.455461 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:58:14.457066 543705 disk_worker.go:494] system disk:vda1
I0320 02:58:14.457097 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:58:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:58:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:58:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:58:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:58:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:58:22.265676 543705 disk_info.go:125] begin check local disk info of client
I0320 02:58:22.268159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:58:22.268166 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486180 0xc0004861c0]
E0320 02:58:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:23.409782 543705 memory.go:184] no items to output this cycle
I0320 02:58:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 02:58:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:33.409783 543705 memory.go:184] no items to output this cycle
I0320 02:58:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 02:58:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:43.409793 543705 memory.go:191] Add success.
I0320 02:58:43.409794 543705 cpu.go:282] Add success.
I0320 02:58:43.419993 543705 net.go:648] Add success.
I0320 02:58:43.422715 543705 net.go:770] primary dev: ETH0
I0320 02:58:43.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:58:43.422740 543705 net.go:698] Add success.
I0320 02:58:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:58:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:58:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:58:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:58:53.409788 543705 cpu.go:275] no items to output this cycle
I0320 02:58:53.409790 543705 memory.go:184] no items to output this cycle
E0320 02:59:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:03.409778 543705 memory.go:184] no items to output this cycle
I0320 02:59:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 02:59:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:13.409836 543705 memory.go:191] Add success.
I0320 02:59:13.409842 543705 cpu.go:282] Add success.
W0320 02:59:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 02:59:13.409886 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 02:59:13.409890 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 02:59:13.420341 543705 net.go:648] Add success.
I0320 02:59:13.423202 543705 net.go:770] primary dev: ETH0
I0320 02:59:13.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:59:13.423227 543705 net.go:698] Add success.
I0320 02:59:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 02:59:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 02:59:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 02:59:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 02:59:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 02:59:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 02:59:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 02:59:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:59:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:59:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 02:59:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 02:59:22.269672 543705 disk_info.go:125] begin check local disk info of client
I0320 02:59:22.272141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 02:59:22.272147 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fe340 0xc0003fe380]
E0320 02:59:23.410450 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:23.410467 543705 memory.go:184] no items to output this cycle
I0320 02:59:23.410478 543705 cpu.go:275] no items to output this cycle
E0320 02:59:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:33.409768 543705 memory.go:184] no items to output this cycle
I0320 02:59:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 02:59:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:43.409815 543705 memory.go:191] Add success.
I0320 02:59:43.409819 543705 cpu.go:282] Add success.
I0320 02:59:43.419848 543705 net.go:648] Add success.
I0320 02:59:43.422492 543705 net.go:770] primary dev: ETH0
I0320 02:59:43.422508 543705 net.go:802] Send network stats successfully!,count is 6
I0320 02:59:43.422522 543705 net.go:698] Add success.
I0320 02:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 02:59:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 02:59:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 02:59:53.410244 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 02:59:53.410263 543705 memory.go:184] no items to output this cycle
I0320 02:59:53.410289 543705 cpu.go:275] no items to output this cycle
E0320 03:00:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:03.409806 543705 memory.go:184] no items to output this cycle
I0320 03:00:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 03:00:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:13.409780 543705 memory.go:191] Add success.
I0320 03:00:13.409805 543705 cpu.go:282] Add success.
W0320 03:00:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:00:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:00:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:00:13.420222 543705 net.go:648] Add success.
I0320 03:00:13.422771 543705 net.go:770] primary dev: ETH0
I0320 03:00:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:00:13.422796 543705 net.go:698] Add success.
I0320 03:00:13.470207 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"87a90272-2dfa-48aa-a898-165806da00bd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:00:13.470240 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:00:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:00:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:00:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 03:00:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:00:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 03:00:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:00:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:00:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:00:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:00:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:00:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:00:22.273677 543705 disk_info.go:125] begin check local disk info of client
I0320 03:00:22.276174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:00:22.276180 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c140 0xc00034c1c0]
E0320 03:00:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:23.409790 543705 memory.go:184] no items to output this cycle
I0320 03:00:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:00:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:33.409777 543705 memory.go:184] no items to output this cycle
I0320 03:00:33.409781 543705 cpu.go:275] no items to output this cycle
I0320 03:00:38.073732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:00:38.073738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:00:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:43.410618 543705 memory.go:191] Add success.
I0320 03:00:43.409822 543705 cpu.go:282] Add success.
I0320 03:00:43.420599 543705 net.go:648] Add success.
I0320 03:00:43.423205 543705 net.go:770] primary dev: ETH0
I0320 03:00:43.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:00:43.423231 543705 net.go:698] Add success.
I0320 03:00:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:00:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:00:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:00:53.410422 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:00:53.410441 543705 memory.go:184] no items to output this cycle
I0320 03:00:53.410445 543705 cpu.go:275] no items to output this cycle
E0320 03:01:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:03.409770 543705 memory.go:184] no items to output this cycle
I0320 03:01:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:01:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:13.409789 543705 memory.go:191] Add success.
W0320 03:01:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:01:13.409824 543705 cpu.go:282] Add success.
W0320 03:01:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:01:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:01:13.420220 543705 net.go:648] Add success.
I0320 03:01:13.423152 543705 net.go:770] primary dev: ETH0
I0320 03:01:13.423166 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:01:13.423177 543705 net.go:698] Add success.
I0320 03:01:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:01:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:01:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 03:01:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:01:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 03:01:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:01:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:01:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:01:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:01:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:01:22.277674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:01:22.280088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:01:22.280095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370100 0xc000370140]
E0320 03:01:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:23.409781 543705 memory.go:184] no items to output this cycle
I0320 03:01:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:01:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:33.409796 543705 memory.go:184] no items to output this cycle
I0320 03:01:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 03:01:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:43.409808 543705 memory.go:191] Add success.
I0320 03:01:43.409817 543705 cpu.go:282] Add success.
I0320 03:01:43.420016 543705 net.go:648] Add success.
I0320 03:01:43.422848 543705 net.go:770] primary dev: ETH0
I0320 03:01:43.422861 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:01:43.422874 543705 net.go:698] Add success.
I0320 03:01:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:01:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:01:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:01:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:01:53.409808 543705 memory.go:184] no items to output this cycle
I0320 03:01:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:02:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:03.409785 543705 memory.go:184] no items to output this cycle
I0320 03:02:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 03:02:13.410531 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:13.410566 543705 memory.go:191] Add success.
I0320 03:02:13.410571 543705 cpu.go:282] Add success.
W0320 03:02:13.410598 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:02:13.410615 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:02:13.410619 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:02:13.419895 543705 net.go:648] Add success.
I0320 03:02:13.422788 543705 net.go:770] primary dev: ETH0
I0320 03:02:13.422801 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:02:13.422813 543705 net.go:698] Add success.
W0320 03:02:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:02:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 03:02:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:02:14.455873 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:02:14.455882 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:02:14.455888 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:02:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 03:02:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:02:15.456812 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:02:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:02:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:02:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:02:16.457991 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:02:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:02:16.472437 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:02:22.281675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:02:22.284177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:02:22.284184 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6180 0xc0003b61c0]
E0320 03:02:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:23.409771 543705 memory.go:184] no items to output this cycle
I0320 03:02:23.409778 543705 cpu.go:275] no items to output this cycle
E0320 03:02:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:33.409769 543705 memory.go:184] no items to output this cycle
I0320 03:02:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:02:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:43.409776 543705 memory.go:191] Add success.
I0320 03:02:43.409809 543705 cpu.go:282] Add success.
I0320 03:02:43.420020 543705 net.go:648] Add success.
I0320 03:02:43.422547 543705 net.go:770] primary dev: ETH0
I0320 03:02:43.422560 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:02:43.422572 543705 net.go:698] Add success.
I0320 03:02:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:02:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:02:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:02:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:02:53.409789 543705 memory.go:184] no items to output this cycle
I0320 03:02:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 03:03:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:03.409802 543705 memory.go:184] no items to output this cycle
I0320 03:03:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 03:03:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:13.409832 543705 memory.go:191] Add success.
I0320 03:03:13.409840 543705 cpu.go:282] Add success.
W0320 03:03:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:03:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:03:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:03:13.420179 543705 net.go:648] Add success.
I0320 03:03:13.423088 543705 net.go:770] primary dev: ETH0
I0320 03:03:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:03:13.423116 543705 net.go:698] Add success.
I0320 03:03:13.464367 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98a2cecb-d212-4f2e-aec7-cb68319b30c5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:03:13.464399 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:03:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:03:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:03:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 03:03:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:03:14.456534 543705 disk_worker.go:494] system disk:vda1
I0320 03:03:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:03:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:03:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:03:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:03:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:03:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:03:22.285671 543705 disk_info.go:125] begin check local disk info of client
I0320 03:03:22.288153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:03:22.288159 543705 disk_info.go:196] parse disk info done, disk is : [0xc000354340 0xc000354380]
E0320 03:03:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:23.409771 543705 memory.go:184] no items to output this cycle
I0320 03:03:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 03:03:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 03:03:33.409797 543705 memory.go:184] no items to output this cycle
I0320 03:03:38.077063 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:03:38.077071 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:03:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:43.410657 543705 memory.go:191] Add success.
I0320 03:03:43.409793 543705 cpu.go:282] Add success.
I0320 03:03:43.420378 543705 net.go:648] Add success.
I0320 03:03:43.422952 543705 net.go:770] primary dev: ETH0
I0320 03:03:43.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:03:43.422977 543705 net.go:698] Add success.
I0320 03:03:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:03:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:03:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:03:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:03:53.409780 543705 memory.go:184] no items to output this cycle
I0320 03:03:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 03:04:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:03.409804 543705 memory.go:184] no items to output this cycle
I0320 03:04:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 03:04:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:13.409790 543705 memory.go:191] Add success.
I0320 03:04:13.409791 543705 cpu.go:282] Add success.
W0320 03:04:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:04:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:04:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:04:13.420144 543705 net.go:648] Add success.
I0320 03:04:13.423145 543705 net.go:770] primary dev: ETH0
I0320 03:04:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:04:13.423171 543705 net.go:698] Add success.
I0320 03:04:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:04:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:04:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 03:04:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:04:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 03:04:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:04:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:04:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:04:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:04:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:04:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:04:22.289676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:04:22.292171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:04:22.292178 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029e2c0 0xc00029e300]
E0320 03:04:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:23.409764 543705 memory.go:184] no items to output this cycle
I0320 03:04:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:04:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:33.409801 543705 memory.go:184] no items to output this cycle
I0320 03:04:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 03:04:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:43.409793 543705 memory.go:191] Add success.
I0320 03:04:43.409804 543705 cpu.go:282] Add success.
I0320 03:04:43.419859 543705 net.go:648] Add success.
I0320 03:04:43.422602 543705 net.go:770] primary dev: ETH0
I0320 03:04:43.422617 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:04:43.422633 543705 net.go:698] Add success.
I0320 03:04:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:04:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:04:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:04:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:04:53.409815 543705 memory.go:184] no items to output this cycle
I0320 03:04:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:05:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:03.409781 543705 memory.go:184] no items to output this cycle
I0320 03:05:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 03:05:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:13.409809 543705 memory.go:191] Add success.
I0320 03:05:13.409813 543705 cpu.go:282] Add success.
W0320 03:05:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:05:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:05:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:05:13.420169 543705 net.go:648] Add success.
I0320 03:05:13.423062 543705 net.go:770] primary dev: ETH0
I0320 03:05:13.423075 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:05:13.423087 543705 net.go:698] Add success.
I0320 03:05:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:05:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:05:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 03:05:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:05:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 03:05:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:05:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:05:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:05:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:05:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:05:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:05:22.293678 543705 disk_info.go:125] begin check local disk info of client
I0320 03:05:22.296053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:05:22.296060 543705 disk_info.go:196] parse disk info done, disk is : [0xc000495e80 0xc000495ec0]
E0320 03:05:23.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:23.409878 543705 memory.go:184] no items to output this cycle
I0320 03:05:23.409952 543705 cpu.go:275] no items to output this cycle
E0320 03:05:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:33.409776 543705 memory.go:184] no items to output this cycle
I0320 03:05:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:05:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:43.409818 543705 memory.go:191] Add success.
I0320 03:05:43.409823 543705 cpu.go:282] Add success.
I0320 03:05:43.419940 543705 net.go:648] Add success.
I0320 03:05:43.422883 543705 net.go:770] primary dev: ETH0
I0320 03:05:43.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:05:43.422908 543705 net.go:698] Add success.
I0320 03:05:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:05:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:05:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:05:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:05:53.409779 543705 memory.go:184] no items to output this cycle
I0320 03:05:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:06:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:03.409772 543705 memory.go:184] no items to output this cycle
I0320 03:06:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 03:06:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:13.409813 543705 memory.go:191] Add success.
I0320 03:06:13.409821 543705 cpu.go:282] Add success.
W0320 03:06:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:06:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:06:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:06:13.420162 543705 net.go:648] Add success.
I0320 03:06:13.422766 543705 net.go:770] primary dev: ETH0
I0320 03:06:13.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:06:13.422792 543705 net.go:698] Add success.
I0320 03:06:13.465602 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"46fd14b9-47b9-4aab-8592-e13441e9ea6c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:06:13.465642 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:06:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:06:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:06:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 03:06:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:06:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 03:06:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:06:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:06:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:06:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:06:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:06:22.297679 543705 disk_info.go:125] begin check local disk info of client
I0320 03:06:22.300099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:06:22.300106 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047ce80 0xc00047cec0]
E0320 03:06:23.409849 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:23.409872 543705 memory.go:184] no items to output this cycle
I0320 03:06:23.409983 543705 cpu.go:275] no items to output this cycle
E0320 03:06:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:33.409777 543705 memory.go:184] no items to output this cycle
I0320 03:06:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 03:06:38.077733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:06:38.077740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:06:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:43.410734 543705 memory.go:191] Add success.
I0320 03:06:43.409830 543705 cpu.go:282] Add success.
I0320 03:06:43.420244 543705 net.go:770] primary dev: ETH0
I0320 03:06:43.420259 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:06:43.420275 543705 net.go:698] Add success.
I0320 03:06:43.420621 543705 net.go:648] Add success.
I0320 03:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:06:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:06:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:06:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:06:53.409790 543705 memory.go:184] no items to output this cycle
I0320 03:06:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 03:07:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:03.409807 543705 memory.go:184] no items to output this cycle
I0320 03:07:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 03:07:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:13.409837 543705 memory.go:191] Add success.
I0320 03:07:13.409838 543705 cpu.go:282] Add success.
W0320 03:07:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:07:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:07:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:07:13.420229 543705 net.go:648] Add success.
I0320 03:07:13.423006 543705 net.go:770] primary dev: ETH0
I0320 03:07:13.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:07:13.423030 543705 net.go:698] Add success.
I0320 03:07:13.453561 543705 event_worker.go:152] Polling the log file for events...
W0320 03:07:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:07:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 03:07:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:07:14.456792 543705 disk_worker.go:494] system disk:vda1
I0320 03:07:14.456831 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:07:14.457224 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:07:14.457232 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:07:14.457237 543705 custom_config.go:64] query custom config with name: gpu
E0320 03:07:15.456849 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:07:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:07:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:07:16.457979 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:07:16.458021 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:07:16.458037 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:07:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:07:22.301679 543705 disk_info.go:125] begin check local disk info of client
I0320 03:07:22.304059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:07:22.304065 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032b800 0xc00032b840]
E0320 03:07:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:23.409789 543705 memory.go:184] no items to output this cycle
I0320 03:07:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:07:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:33.409790 543705 memory.go:184] no items to output this cycle
I0320 03:07:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:07:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:43.409796 543705 memory.go:191] Add success.
I0320 03:07:43.409796 543705 cpu.go:282] Add success.
I0320 03:07:43.419831 543705 net.go:770] primary dev: ETH0
I0320 03:07:43.419843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:07:43.419855 543705 net.go:698] Add success.
I0320 03:07:43.420083 543705 net.go:648] Add success.
I0320 03:07:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:07:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:07:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:07:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:07:53.409797 543705 memory.go:184] no items to output this cycle
I0320 03:07:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:08:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:03.409793 543705 memory.go:184] no items to output this cycle
I0320 03:08:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:08:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:13.409808 543705 memory.go:191] Add success.
I0320 03:08:13.409831 543705 cpu.go:282] Add success.
W0320 03:08:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:08:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:08:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:08:13.420037 543705 net.go:648] Add success.
I0320 03:08:13.422760 543705 net.go:770] primary dev: ETH0
I0320 03:08:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:08:13.422784 543705 net.go:698] Add success.
I0320 03:08:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:08:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:08:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 03:08:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:08:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 03:08:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:08:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:08:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:08:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:08:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:08:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:08:22.305681 543705 disk_info.go:125] begin check local disk info of client
I0320 03:08:22.308148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:08:22.308154 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024e600 0xc00024e640]
E0320 03:08:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:23.409797 543705 memory.go:184] no items to output this cycle
I0320 03:08:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 03:08:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:33.409799 543705 memory.go:184] no items to output this cycle
I0320 03:08:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 03:08:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:43.409799 543705 memory.go:191] Add success.
I0320 03:08:43.409803 543705 cpu.go:282] Add success.
I0320 03:08:43.419984 543705 net.go:648] Add success.
I0320 03:08:43.422582 543705 net.go:770] primary dev: ETH0
I0320 03:08:43.422594 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:08:43.422606 543705 net.go:698] Add success.
I0320 03:08:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:08:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:08:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:08:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:08:53.409807 543705 memory.go:184] no items to output this cycle
I0320 03:08:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:09:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:03.409807 543705 memory.go:184] no items to output this cycle
I0320 03:09:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 03:09:13.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:13.409842 543705 memory.go:191] Add success.
I0320 03:09:13.409850 543705 cpu.go:282] Add success.
W0320 03:09:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:09:13.409891 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:09:13.409896 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:09:13.420210 543705 net.go:648] Add success.
I0320 03:09:13.423031 543705 net.go:770] primary dev: ETH0
I0320 03:09:13.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:09:13.423056 543705 net.go:698] Add success.
I0320 03:09:13.469374 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3971057d-2688-4551-abd3-48f3283a5ba3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:09:13.469407 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:09:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:09:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:09:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 03:09:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:09:14.456481 543705 disk_worker.go:494] system disk:vda1
I0320 03:09:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:09:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:09:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:09:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:09:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:09:16.472423 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:09:22.309671 543705 disk_info.go:125] begin check local disk info of client
I0320 03:09:22.312070 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:09:22.312076 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344240 0xc000344280]
E0320 03:09:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:23.409793 543705 memory.go:184] no items to output this cycle
I0320 03:09:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 03:09:33.409911 543705 cpu.go:275] no items to output this cycle
E0320 03:09:33.410053 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:33.410069 543705 memory.go:184] no items to output this cycle
I0320 03:09:38.077880 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:09:38.077887 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:09:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:43.410663 543705 memory.go:191] Add success.
I0320 03:09:43.409820 543705 cpu.go:282] Add success.
I0320 03:09:43.420418 543705 net.go:648] Add success.
I0320 03:09:43.422904 543705 net.go:770] primary dev: ETH0
I0320 03:09:43.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:09:43.422929 543705 net.go:698] Add success.
I0320 03:09:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:09:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:09:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:09:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:09:53.409798 543705 memory.go:184] no items to output this cycle
I0320 03:09:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 03:10:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:03.409776 543705 memory.go:184] no items to output this cycle
I0320 03:10:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:10:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:13.409826 543705 memory.go:191] Add success.
I0320 03:10:13.409835 543705 cpu.go:282] Add success.
W0320 03:10:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:10:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:10:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:10:13.420184 543705 net.go:648] Add success.
I0320 03:10:13.422882 543705 net.go:770] primary dev: ETH0
I0320 03:10:13.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:10:13.422906 543705 net.go:698] Add success.
I0320 03:10:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:10:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:10:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 03:10:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:10:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 03:10:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:10:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:10:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:10:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:10:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:10:22.313675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:10:22.316144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:10:22.316150 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b8dc0 0xc0003b8e00]
E0320 03:10:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:23.409807 543705 memory.go:184] no items to output this cycle
I0320 03:10:23.409820 543705 cpu.go:275] no items to output this cycle
E0320 03:10:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:33.409808 543705 memory.go:184] no items to output this cycle
I0320 03:10:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:10:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:43.409788 543705 memory.go:191] Add success.
I0320 03:10:43.409819 543705 cpu.go:282] Add success.
I0320 03:10:43.419853 543705 net.go:648] Add success.
I0320 03:10:43.422571 543705 net.go:770] primary dev: ETH0
I0320 03:10:43.422583 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:10:43.422595 543705 net.go:698] Add success.
I0320 03:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:10:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:10:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:10:53.409787 543705 memory.go:184] no items to output this cycle
I0320 03:10:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 03:11:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:03.409800 543705 memory.go:184] no items to output this cycle
I0320 03:11:03.409821 543705 cpu.go:275] no items to output this cycle
I0320 03:11:13.409809 543705 cpu.go:282] Add success.
E0320 03:11:13.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:13.409847 543705 memory.go:191] Add success.
W0320 03:11:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:11:13.409895 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:11:13.409899 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:11:13.420239 543705 net.go:648] Add success.
I0320 03:11:13.423203 543705 net.go:770] primary dev: ETH0
I0320 03:11:13.423216 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:11:13.423229 543705 net.go:698] Add success.
I0320 03:11:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:11:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:11:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 03:11:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:11:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 03:11:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:11:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:11:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:11:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:11:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:11:16.472451 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:11:22.317676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:11:22.320179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:11:22.320186 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b9600 0xc0002b9640]
E0320 03:11:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:23.409786 543705 memory.go:184] no items to output this cycle
I0320 03:11:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:11:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:33.409772 543705 memory.go:184] no items to output this cycle
I0320 03:11:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 03:11:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:43.409779 543705 memory.go:191] Add success.
I0320 03:11:43.409799 543705 cpu.go:282] Add success.
I0320 03:11:43.420223 543705 net.go:648] Add success.
I0320 03:11:43.422932 543705 net.go:770] primary dev: ETH0
I0320 03:11:43.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:11:43.422957 543705 net.go:698] Add success.
I0320 03:11:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:11:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:11:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:11:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:11:53.409795 543705 memory.go:184] no items to output this cycle
I0320 03:11:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:12:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:03.409776 543705 memory.go:184] no items to output this cycle
I0320 03:12:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:12:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:13.409814 543705 memory.go:191] Add success.
I0320 03:12:13.409816 543705 cpu.go:282] Add success.
W0320 03:12:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:12:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:12:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:12:13.420304 543705 net.go:648] Add success.
I0320 03:12:13.422921 543705 net.go:770] primary dev: ETH0
I0320 03:12:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:12:13.422947 543705 net.go:698] Add success.
I0320 03:12:13.469297 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ebd67f9b-c743-4a08-a276-47e6bdabcfdf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:12:13.469331 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 03:12:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:12:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 03:12:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:12:14.456181 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:12:14.456191 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:12:14.456197 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:12:14.456460 543705 disk_worker.go:494] system disk:vda1
I0320 03:12:14.456490 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:12:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:12:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:12:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:12:16.457988 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:12:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:12:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:12:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:12:22.321676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:12:22.324219 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:12:22.324224 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b9c80 0xc0002b9cc0]
E0320 03:12:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:23.409775 543705 memory.go:184] no items to output this cycle
I0320 03:12:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 03:12:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:33.409798 543705 memory.go:184] no items to output this cycle
I0320 03:12:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 03:12:38.081084 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:12:38.081091 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:12:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:43.410643 543705 memory.go:191] Add success.
I0320 03:12:43.409792 543705 cpu.go:282] Add success.
I0320 03:12:43.420741 543705 net.go:648] Add success.
I0320 03:12:43.423490 543705 net.go:770] primary dev: ETH0
I0320 03:12:43.423504 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:12:43.423515 543705 net.go:698] Add success.
I0320 03:12:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:12:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:12:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:12:53.410367 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:12:53.410386 543705 memory.go:184] no items to output this cycle
I0320 03:12:53.410399 543705 cpu.go:275] no items to output this cycle
E0320 03:13:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:03.409801 543705 memory.go:184] no items to output this cycle
I0320 03:13:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 03:13:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:13.409789 543705 memory.go:191] Add success.
W0320 03:13:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:13:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:13:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:13:13.409856 543705 cpu.go:282] Add success.
I0320 03:13:13.420315 543705 net.go:648] Add success.
I0320 03:13:13.421231 543705 net.go:770] primary dev: ETH0
I0320 03:13:13.421246 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:13:13.421259 543705 net.go:698] Add success.
I0320 03:13:14.454996 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:13:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:13:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 03:13:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:13:14.456545 543705 disk_worker.go:494] system disk:vda1
I0320 03:13:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:13:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:13:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:13:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:13:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:13:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:13:22.325683 543705 disk_info.go:125] begin check local disk info of client
I0320 03:13:22.328089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:13:22.328095 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b8940 0xc0002b8980]
E0320 03:13:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:23.409785 543705 memory.go:184] no items to output this cycle
I0320 03:13:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:13:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:33.409784 543705 memory.go:184] no items to output this cycle
I0320 03:13:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:13:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:43.409813 543705 memory.go:191] Add success.
I0320 03:13:43.409824 543705 cpu.go:282] Add success.
I0320 03:13:43.420056 543705 net.go:648] Add success.
I0320 03:13:43.422922 543705 net.go:770] primary dev: ETH0
I0320 03:13:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:13:43.422947 543705 net.go:698] Add success.
I0320 03:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:13:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:13:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:13:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:13:53.409792 543705 memory.go:184] no items to output this cycle
I0320 03:13:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 03:14:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:03.409773 543705 memory.go:184] no items to output this cycle
I0320 03:14:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 03:14:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:13.409787 543705 memory.go:191] Add success.
I0320 03:14:13.409806 543705 cpu.go:282] Add success.
W0320 03:14:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:14:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:14:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:14:13.420128 543705 net.go:648] Add success.
I0320 03:14:13.423172 543705 net.go:770] primary dev: ETH0
I0320 03:14:13.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:14:13.423197 543705 net.go:698] Add success.
I0320 03:14:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:14:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:14:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0320 03:14:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:14:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 03:14:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:14:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:14:16.457663 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:14:16.457721 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:14:16.457740 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:14:16.473050 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:14:22.329676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:14:22.332159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:14:22.332166 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b8d40 0xc0002b8d80]
E0320 03:14:23.410406 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:23.410422 543705 memory.go:184] no items to output this cycle
I0320 03:14:23.410431 543705 cpu.go:275] no items to output this cycle
E0320 03:14:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:33.409800 543705 memory.go:184] no items to output this cycle
I0320 03:14:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 03:14:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:43.409794 543705 memory.go:191] Add success.
I0320 03:14:43.409799 543705 cpu.go:282] Add success.
I0320 03:14:43.419881 543705 net.go:648] Add success.
I0320 03:14:43.422394 543705 net.go:770] primary dev: ETH0
I0320 03:14:43.422411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:14:43.422424 543705 net.go:698] Add success.
I0320 03:14:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:14:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:14:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:14:53.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:14:53.409916 543705 memory.go:184] no items to output this cycle
I0320 03:14:53.409976 543705 cpu.go:275] no items to output this cycle
E0320 03:15:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:03.409776 543705 memory.go:184] no items to output this cycle
I0320 03:15:03.409777 543705 cpu.go:275] no items to output this cycle
E0320 03:15:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:13.409810 543705 memory.go:191] Add success.
I0320 03:15:13.409822 543705 cpu.go:282] Add success.
W0320 03:15:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:15:13.412516 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:15:13.412521 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:15:13.420208 543705 net.go:648] Add success.
I0320 03:15:13.421978 543705 net.go:770] primary dev: ETH0
I0320 03:15:13.421993 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:15:13.422006 543705 net.go:698] Add success.
I0320 03:15:13.471424 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"76fdfcf5-cd4f-4f75-997b-f461619d135a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:15:13.471468 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:15:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:15:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:15:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 03:15:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:15:14.456737 543705 disk_worker.go:494] system disk:vda1
I0320 03:15:14.456771 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:15:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:15:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:15:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:15:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:15:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:15:22.333670 543705 disk_info.go:125] begin check local disk info of client
I0320 03:15:22.336116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:15:22.336123 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b880 0xc00007b8c0]
E0320 03:15:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:23.409788 543705 memory.go:184] no items to output this cycle
I0320 03:15:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:15:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:33.409769 543705 memory.go:184] no items to output this cycle
I0320 03:15:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 03:15:38.081731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:15:38.081737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:15:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:43.410704 543705 memory.go:191] Add success.
I0320 03:15:43.409826 543705 cpu.go:282] Add success.
I0320 03:15:43.420419 543705 net.go:648] Add success.
I0320 03:15:43.423189 543705 net.go:770] primary dev: ETH0
I0320 03:15:43.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:15:43.423216 543705 net.go:698] Add success.
I0320 03:15:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:15:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:15:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:15:53.409898 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:15:53.409919 543705 memory.go:184] no items to output this cycle
I0320 03:15:53.409951 543705 cpu.go:275] no items to output this cycle
E0320 03:16:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:03.409802 543705 memory.go:184] no items to output this cycle
I0320 03:16:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 03:16:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:13.409782 543705 memory.go:191] Add success.
I0320 03:16:13.409806 543705 cpu.go:282] Add success.
W0320 03:16:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:16:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:16:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:16:13.420264 543705 net.go:648] Add success.
I0320 03:16:13.422811 543705 net.go:770] primary dev: ETH0
I0320 03:16:13.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:16:13.422841 543705 net.go:698] Add success.
I0320 03:16:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:16:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:16:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 03:16:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:16:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 03:16:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:16:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:16:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:16:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:16:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:16:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:16:22.337673 543705 disk_info.go:125] begin check local disk info of client
I0320 03:16:22.340232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:16:22.340239 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed100 0xc0000ed140]
E0320 03:16:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:23.409771 543705 memory.go:184] no items to output this cycle
I0320 03:16:23.409779 543705 cpu.go:275] no items to output this cycle
E0320 03:16:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:33.409798 543705 memory.go:184] no items to output this cycle
I0320 03:16:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 03:16:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:43.409817 543705 memory.go:191] Add success.
I0320 03:16:43.409823 543705 cpu.go:282] Add success.
I0320 03:16:43.419983 543705 net.go:648] Add success.
I0320 03:16:43.422959 543705 net.go:770] primary dev: ETH0
I0320 03:16:43.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:16:43.422989 543705 net.go:698] Add success.
I0320 03:16:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:16:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:16:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:16:53.410532 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:16:53.410553 543705 memory.go:184] no items to output this cycle
I0320 03:16:53.410565 543705 cpu.go:275] no items to output this cycle
E0320 03:17:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:03.409773 543705 memory.go:184] no items to output this cycle
I0320 03:17:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:17:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:13.409777 543705 memory.go:191] Add success.
W0320 03:17:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:17:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:17:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:17:13.409851 543705 cpu.go:282] Add success.
I0320 03:17:13.420505 543705 net.go:648] Add success.
I0320 03:17:13.423345 543705 net.go:770] primary dev: ETH0
I0320 03:17:13.423364 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:17:13.423381 543705 net.go:698] Add success.
I0320 03:17:13.453360 543705 event_worker.go:152] Polling the log file for events...
W0320 03:17:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:17:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 03:17:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:17:14.456631 543705 disk_worker.go:494] system disk:vda1
I0320 03:17:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:17:14.458174 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:17:14.458183 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:17:14.458189 543705 custom_config.go:64] query custom config with name: gpu
E0320 03:17:15.456853 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:17:15.456864 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:17:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:17:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:17:16.457964 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:17:16.457983 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:17:16.472311 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:17:22.341675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:17:22.344097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:17:22.344103 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003148c0 0xc000314900]
E0320 03:17:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:23.409787 543705 memory.go:184] no items to output this cycle
I0320 03:17:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 03:17:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:33.409800 543705 memory.go:184] no items to output this cycle
I0320 03:17:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 03:17:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:43.409776 543705 memory.go:191] Add success.
I0320 03:17:43.409805 543705 cpu.go:282] Add success.
I0320 03:17:43.419861 543705 net.go:648] Add success.
I0320 03:17:43.422501 543705 net.go:770] primary dev: ETH0
I0320 03:17:43.422517 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:17:43.422532 543705 net.go:698] Add success.
I0320 03:17:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:17:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:17:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:17:53.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:17:53.409859 543705 memory.go:184] no items to output this cycle
I0320 03:17:53.409941 543705 cpu.go:275] no items to output this cycle
E0320 03:18:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:03.409802 543705 memory.go:184] no items to output this cycle
I0320 03:18:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 03:18:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:13.409789 543705 memory.go:191] Add success.
I0320 03:18:13.409797 543705 cpu.go:282] Add success.
W0320 03:18:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:18:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:18:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:18:13.420088 543705 net.go:648] Add success.
I0320 03:18:13.422799 543705 net.go:770] primary dev: ETH0
I0320 03:18:13.422812 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:18:13.422824 543705 net.go:698] Add success.
I0320 03:18:13.469318 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"261ac0fd-92db-411f-bcac-a27b2f503ae3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:18:13.469352 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:18:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:18:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:18:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 03:18:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:18:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 03:18:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:18:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:18:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:18:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:18:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:18:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:18:22.345672 543705 disk_info.go:125] begin check local disk info of client
I0320 03:18:22.348117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:18:22.348124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003779c0 0xc000377a00]
E0320 03:18:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:23.409777 543705 memory.go:184] no items to output this cycle
I0320 03:18:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 03:18:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:33.409773 543705 memory.go:184] no items to output this cycle
I0320 03:18:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 03:18:38.081874 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:18:38.081881 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:18:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:43.410774 543705 memory.go:191] Add success.
I0320 03:18:43.409918 543705 cpu.go:282] Add success.
I0320 03:18:43.420544 543705 net.go:648] Add success.
I0320 03:18:43.423409 543705 net.go:770] primary dev: ETH0
I0320 03:18:43.423422 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:18:43.423434 543705 net.go:698] Add success.
I0320 03:18:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:18:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:18:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:18:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:18:53.409800 543705 memory.go:184] no items to output this cycle
I0320 03:18:53.409829 543705 cpu.go:275] no items to output this cycle
E0320 03:19:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:03.409783 543705 memory.go:184] no items to output this cycle
I0320 03:19:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:19:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:13.409793 543705 memory.go:191] Add success.
W0320 03:19:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:19:13.409821 543705 cpu.go:282] Add success.
W0320 03:19:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:19:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:19:13.420214 543705 net.go:648] Add success.
I0320 03:19:13.422996 543705 net.go:770] primary dev: ETH0
I0320 03:19:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:19:13.423021 543705 net.go:698] Add success.
I0320 03:19:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:19:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:19:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 03:19:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:19:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 03:19:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:19:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:19:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:19:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:19:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:19:22.349674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:19:22.352125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:19:22.352131 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395300 0xc000395340]
E0320 03:19:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:23.409792 543705 memory.go:184] no items to output this cycle
I0320 03:19:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 03:19:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:33.409788 543705 memory.go:184] no items to output this cycle
I0320 03:19:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:19:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:43.409815 543705 memory.go:191] Add success.
I0320 03:19:43.409854 543705 cpu.go:282] Add success.
I0320 03:19:43.420166 543705 net.go:648] Add success.
I0320 03:19:43.423282 543705 net.go:770] primary dev: ETH0
I0320 03:19:43.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:19:43.423310 543705 net.go:698] Add success.
I0320 03:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:19:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:19:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:19:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:19:53.409792 543705 memory.go:184] no items to output this cycle
I0320 03:19:53.409943 543705 cpu.go:275] no items to output this cycle
E0320 03:20:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:03.409804 543705 memory.go:184] no items to output this cycle
I0320 03:20:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 03:20:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:13.409792 543705 memory.go:191] Add success.
I0320 03:20:13.409795 543705 cpu.go:282] Add success.
W0320 03:20:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:20:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:20:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:20:13.420100 543705 net.go:648] Add success.
I0320 03:20:13.422674 543705 net.go:770] primary dev: ETH0
I0320 03:20:13.422687 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:20:13.422698 543705 net.go:698] Add success.
I0320 03:20:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:20:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:20:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 03:20:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:20:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 03:20:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:20:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:20:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:20:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:20:22.353678 543705 disk_info.go:125] begin check local disk info of client
I0320 03:20:22.356170 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:20:22.356176 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5780 0xc0000c57c0]
E0320 03:20:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:23.409760 543705 memory.go:184] no items to output this cycle
I0320 03:20:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:20:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:33.409789 543705 memory.go:184] no items to output this cycle
I0320 03:20:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:20:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:43.409834 543705 memory.go:191] Add success.
I0320 03:20:43.409845 543705 cpu.go:282] Add success.
I0320 03:20:43.420035 543705 net.go:648] Add success.
I0320 03:20:43.422754 543705 net.go:770] primary dev: ETH0
I0320 03:20:43.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:20:43.422794 543705 net.go:698] Add success.
I0320 03:20:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:20:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:20:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:20:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:20:53.409826 543705 memory.go:184] no items to output this cycle
I0320 03:20:53.409834 543705 cpu.go:275] no items to output this cycle
E0320 03:21:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:03.409909 543705 memory.go:184] no items to output this cycle
I0320 03:21:03.409985 543705 cpu.go:275] no items to output this cycle
E0320 03:21:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:13.409807 543705 memory.go:191] Add success.
I0320 03:21:13.409809 543705 cpu.go:282] Add success.
W0320 03:21:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:21:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:21:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:21:13.420158 543705 net.go:648] Add success.
I0320 03:21:13.423092 543705 net.go:770] primary dev: ETH0
I0320 03:21:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:21:13.423116 543705 net.go:698] Add success.
I0320 03:21:13.941897 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"be26a690-f320-4b56-aa71-1de4a6facaa4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:21:13.941946 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:21:14.454524 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:21:14.454692 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:21:14.454782 543705 disk_worker.go:708] disk space is not compliant
W0320 03:21:14.454786 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:21:14.456287 543705 disk_worker.go:494] system disk:vda1
I0320 03:21:14.456321 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:21:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:21:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:21:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:21:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:21:22.357675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:21:22.360153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:21:22.360160 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ecd80 0xc0000ecdc0]
E0320 03:21:23.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:23.409758 543705 memory.go:184] no items to output this cycle
I0320 03:21:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:21:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:33.409798 543705 memory.go:184] no items to output this cycle
I0320 03:21:33.409820 543705 cpu.go:275] no items to output this cycle
I0320 03:21:38.085106 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:21:38.085112 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:21:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:43.410675 543705 memory.go:191] Add success.
I0320 03:21:43.409802 543705 cpu.go:282] Add success.
I0320 03:21:43.420392 543705 net.go:648] Add success.
I0320 03:21:43.422956 543705 net.go:770] primary dev: ETH0
I0320 03:21:43.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:21:43.422983 543705 net.go:698] Add success.
I0320 03:21:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:21:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:21:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:21:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:21:53.409786 543705 memory.go:184] no items to output this cycle
I0320 03:21:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 03:22:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:03.409783 543705 memory.go:184] no items to output this cycle
I0320 03:22:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:22:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:13.409781 543705 memory.go:191] Add success.
W0320 03:22:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:22:13.409815 543705 cpu.go:282] Add success.
W0320 03:22:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:22:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:22:13.420120 543705 net.go:648] Add success.
I0320 03:22:13.423120 543705 net.go:770] primary dev: ETH0
I0320 03:22:13.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:22:13.423145 543705 net.go:698] Add success.
W0320 03:22:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:22:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 03:22:14.455165 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:22:14.456935 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:22:14.456944 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:22:14.456950 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:22:14.456990 543705 disk_worker.go:494] system disk:vda1
I0320 03:22:14.457030 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:22:15.456812 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:22:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:22:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:22:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:22:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:22:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:22:16.472332 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:22:22.361675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:22:22.364230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:22:22.364237 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab700 0xc0001ab740]
E0320 03:22:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:23.409789 543705 memory.go:184] no items to output this cycle
I0320 03:22:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:22:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:33.409806 543705 memory.go:184] no items to output this cycle
I0320 03:22:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 03:22:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:43.409799 543705 memory.go:191] Add success.
I0320 03:22:43.409801 543705 cpu.go:282] Add success.
I0320 03:22:43.419903 543705 net.go:648] Add success.
I0320 03:22:43.422842 543705 net.go:770] primary dev: ETH0
I0320 03:22:43.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:22:43.422878 543705 net.go:698] Add success.
I0320 03:22:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:22:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:22:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:22:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:22:53.409780 543705 memory.go:184] no items to output this cycle
I0320 03:22:53.409889 543705 cpu.go:275] no items to output this cycle
E0320 03:23:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:03.409769 543705 memory.go:184] no items to output this cycle
I0320 03:23:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 03:23:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:13.409914 543705 memory.go:191] Add success.
W0320 03:23:13.409946 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:23:13.409966 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:23:13.409966 543705 cpu.go:282] Add success.
I0320 03:23:13.409969 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:23:13.419717 543705 net.go:648] Add success.
I0320 03:23:13.422832 543705 net.go:770] primary dev: ETH0
I0320 03:23:13.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:23:13.422857 543705 net.go:698] Add success.
I0320 03:23:14.454984 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:23:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:23:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 03:23:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:23:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 03:23:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:23:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:23:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:23:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:23:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:23:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:23:22.365674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:23:22.368127 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:23:22.368133 543705 disk_info.go:196] parse disk info done, disk is : [0xc000538f40 0xc000538f80]
E0320 03:23:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:23.409783 543705 memory.go:184] no items to output this cycle
I0320 03:23:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:23:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:33.409773 543705 memory.go:184] no items to output this cycle
I0320 03:23:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:23:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:43.409817 543705 memory.go:191] Add success.
I0320 03:23:43.409823 543705 cpu.go:282] Add success.
I0320 03:23:43.419877 543705 net.go:648] Add success.
I0320 03:23:43.422892 543705 net.go:770] primary dev: ETH0
I0320 03:23:43.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:23:43.422931 543705 net.go:698] Add success.
I0320 03:23:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:23:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:23:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:23:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:23:53.409782 543705 memory.go:184] no items to output this cycle
I0320 03:23:53.409856 543705 cpu.go:275] no items to output this cycle
E0320 03:24:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:03.409782 543705 memory.go:184] no items to output this cycle
I0320 03:24:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:24:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:13.409787 543705 memory.go:191] Add success.
I0320 03:24:13.409792 543705 cpu.go:282] Add success.
W0320 03:24:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:24:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:24:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:24:13.420072 543705 net.go:648] Add success.
I0320 03:24:13.422997 543705 net.go:770] primary dev: ETH0
I0320 03:24:13.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:24:13.423022 543705 net.go:698] Add success.
I0320 03:24:13.469228 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9c0897d3-450e-4452-95b9-ccfedbf16aeb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:24:13.469350 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:24:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:24:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:24:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 03:24:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:24:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 03:24:14.456516 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:24:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:24:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:24:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:24:16.458045 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:24:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:24:22.369675 543705 disk_info.go:125] begin check local disk info of client
I0320 03:24:22.372203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:24:22.372208 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037d4c0 0xc00037d500]
E0320 03:24:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:23.409777 543705 memory.go:184] no items to output this cycle
I0320 03:24:23.409777 543705 cpu.go:275] no items to output this cycle
E0320 03:24:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:33.409794 543705 memory.go:184] no items to output this cycle
I0320 03:24:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 03:24:38.085743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:24:38.085749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:24:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:43.410593 543705 memory.go:191] Add success.
I0320 03:24:43.409807 543705 cpu.go:282] Add success.
I0320 03:24:43.420289 543705 net.go:648] Add success.
I0320 03:24:43.422798 543705 net.go:770] primary dev: ETH0
I0320 03:24:43.422812 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:24:43.422827 543705 net.go:698] Add success.
I0320 03:24:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:24:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:24:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:24:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:24:53.409786 543705 memory.go:184] no items to output this cycle
I0320 03:24:53.409851 543705 cpu.go:275] no items to output this cycle
E0320 03:25:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:03.409772 543705 memory.go:184] no items to output this cycle
I0320 03:25:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:25:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:13.409816 543705 memory.go:191] Add success.
I0320 03:25:13.409822 543705 cpu.go:282] Add success.
W0320 03:25:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:25:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:25:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:25:13.420133 543705 net.go:648] Add success.
I0320 03:25:13.422919 543705 net.go:770] primary dev: ETH0
I0320 03:25:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:25:13.422949 543705 net.go:698] Add success.
I0320 03:25:14.454992 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:25:14.455234 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:25:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0320 03:25:14.455250 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:25:14.456633 543705 disk_worker.go:494] system disk:vda1
I0320 03:25:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:25:15.456014 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:25:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:25:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:25:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:25:16.472495 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:25:22.373674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:25:22.376104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:25:22.376110 543705 disk_info.go:196] parse disk info done, disk is : [0xc000534280 0xc0005342c0]
E0320 03:25:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:23.409788 543705 memory.go:184] no items to output this cycle
I0320 03:25:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 03:25:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:33.409777 543705 memory.go:184] no items to output this cycle
I0320 03:25:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 03:25:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:43.409787 543705 memory.go:191] Add success.
I0320 03:25:43.409811 543705 cpu.go:282] Add success.
I0320 03:25:43.419984 543705 net.go:648] Add success.
I0320 03:25:43.422609 543705 net.go:770] primary dev: ETH0
I0320 03:25:43.422622 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:25:43.422636 543705 net.go:698] Add success.
I0320 03:25:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:25:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:25:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:25:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 03:25:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:25:53.409825 543705 memory.go:184] no items to output this cycle
E0320 03:26:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:03.409782 543705 memory.go:184] no items to output this cycle
I0320 03:26:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:26:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:13.409792 543705 memory.go:191] Add success.
I0320 03:26:13.409798 543705 cpu.go:282] Add success.
W0320 03:26:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:26:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:26:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:26:13.420193 543705 net.go:648] Add success.
I0320 03:26:13.422953 543705 net.go:770] primary dev: ETH0
I0320 03:26:13.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:26:13.422977 543705 net.go:698] Add success.
I0320 03:26:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:26:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:26:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 03:26:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:26:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 03:26:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:26:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:26:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:26:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:26:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:26:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:26:22.377677 543705 disk_info.go:125] begin check local disk info of client
I0320 03:26:22.380231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:26:22.380237 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f01c0 0xc0003f0200]
E0320 03:26:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:23.409774 543705 memory.go:184] no items to output this cycle
I0320 03:26:23.409775 543705 cpu.go:275] no items to output this cycle
E0320 03:26:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:33.409775 543705 memory.go:184] no items to output this cycle
I0320 03:26:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:26:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:43.409791 543705 memory.go:191] Add success.
I0320 03:26:43.409795 543705 cpu.go:282] Add success.
I0320 03:26:43.420044 543705 net.go:648] Add success.
I0320 03:26:43.422639 543705 net.go:770] primary dev: ETH0
I0320 03:26:43.422653 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:26:43.422667 543705 net.go:698] Add success.
I0320 03:26:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:26:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:26:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:26:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:26:53.409809 543705 memory.go:184] no items to output this cycle
I0320 03:26:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 03:27:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:03.409804 543705 memory.go:184] no items to output this cycle
I0320 03:27:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 03:27:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:13.409812 543705 memory.go:191] Add success.
I0320 03:27:13.409820 543705 cpu.go:282] Add success.
W0320 03:27:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:27:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:27:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:27:13.420050 543705 net.go:648] Add success.
I0320 03:27:13.422748 543705 net.go:770] primary dev: ETH0
I0320 03:27:13.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:27:13.422778 543705 net.go:698] Add success.
I0320 03:27:13.429085 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 03:27:13.453259 543705 event_worker.go:152] Polling the log file for events...
I0320 03:27:13.469075 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"effa47fd-490c-428d-b64f-cf4df90cc415","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:27:13.469108 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 03:27:14.455220 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:27:14.455234 543705 disk_worker.go:708] disk space is not compliant
W0320 03:27:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:27:14.456654 543705 disk_worker.go:494] system disk:vda1
I0320 03:27:14.456685 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:27:14.458197 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:27:14.458206 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:27:14.458210 543705 custom_config.go:64] query custom config with name: gpu
E0320 03:27:15.456852 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:27:15.456862 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:27:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:27:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:27:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:27:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:27:16.472337 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:27:22.381676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:27:22.384100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:27:22.384107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee140 0xc0003ee180]
E0320 03:27:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:23.409794 543705 memory.go:184] no items to output this cycle
I0320 03:27:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:27:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:33.409784 543705 memory.go:184] no items to output this cycle
I0320 03:27:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 03:27:38.089137 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:27:38.089144 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:27:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:43.410617 543705 memory.go:191] Add success.
I0320 03:27:43.409805 543705 cpu.go:282] Add success.
I0320 03:27:43.420350 543705 net.go:648] Add success.
I0320 03:27:43.423150 543705 net.go:770] primary dev: ETH0
I0320 03:27:43.423165 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:27:43.423179 543705 net.go:698] Add success.
I0320 03:27:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:27:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:27:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:27:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:27:53.409809 543705 memory.go:184] no items to output this cycle
I0320 03:27:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:28:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:03.409776 543705 memory.go:184] no items to output this cycle
I0320 03:28:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:28:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:13.409800 543705 memory.go:191] Add success.
I0320 03:28:13.409805 543705 cpu.go:282] Add success.
W0320 03:28:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:28:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:28:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:28:13.420133 543705 net.go:648] Add success.
I0320 03:28:13.422853 543705 net.go:770] primary dev: ETH0
I0320 03:28:13.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:28:13.422884 543705 net.go:698] Add success.
I0320 03:28:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:28:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:28:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 03:28:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:28:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 03:28:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:28:15.456021 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:28:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:28:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:28:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:28:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:28:22.385674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:28:22.388120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:28:22.388126 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029ac40 0xc00029ac80]
E0320 03:28:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:23.409791 543705 memory.go:184] no items to output this cycle
I0320 03:28:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:28:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:33.409774 543705 memory.go:184] no items to output this cycle
I0320 03:28:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 03:28:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:43.409815 543705 memory.go:191] Add success.
I0320 03:28:43.409824 543705 cpu.go:282] Add success.
I0320 03:28:43.419888 543705 net.go:648] Add success.
I0320 03:28:43.422678 543705 net.go:770] primary dev: ETH0
I0320 03:28:43.422692 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:28:43.422704 543705 net.go:698] Add success.
I0320 03:28:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:28:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:28:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:28:53.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:28:53.409803 543705 cpu.go:275] no items to output this cycle
I0320 03:28:53.409817 543705 memory.go:184] no items to output this cycle
E0320 03:29:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:03.409772 543705 memory.go:184] no items to output this cycle
I0320 03:29:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:29:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:13.409817 543705 memory.go:191] Add success.
I0320 03:29:13.409826 543705 cpu.go:282] Add success.
W0320 03:29:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:29:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:29:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:29:13.420160 543705 net.go:648] Add success.
I0320 03:29:13.422774 543705 net.go:770] primary dev: ETH0
I0320 03:29:13.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:29:13.422802 543705 net.go:698] Add success.
I0320 03:29:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:29:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:29:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0320 03:29:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:29:14.456673 543705 disk_worker.go:494] system disk:vda1
I0320 03:29:14.456708 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:29:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:29:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:29:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:29:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:29:16.472432 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:29:22.389667 543705 disk_info.go:125] begin check local disk info of client
I0320 03:29:22.392080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:29:22.392086 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004fb9c0 0xc0004fba00]
E0320 03:29:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:23.409790 543705 memory.go:184] no items to output this cycle
I0320 03:29:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 03:29:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:33.409779 543705 memory.go:184] no items to output this cycle
I0320 03:29:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 03:29:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:43.409783 543705 memory.go:191] Add success.
I0320 03:29:43.409807 543705 cpu.go:282] Add success.
I0320 03:29:43.419914 543705 net.go:648] Add success.
I0320 03:29:43.422701 543705 net.go:770] primary dev: ETH0
I0320 03:29:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:29:43.422730 543705 net.go:698] Add success.
I0320 03:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:29:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:29:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:29:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:29:53.409768 543705 memory.go:184] no items to output this cycle
I0320 03:29:53.409859 543705 cpu.go:275] no items to output this cycle
E0320 03:30:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:03.409785 543705 memory.go:184] no items to output this cycle
I0320 03:30:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:30:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:13.409784 543705 memory.go:191] Add success.
I0320 03:30:13.409786 543705 cpu.go:282] Add success.
W0320 03:30:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:30:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:30:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:30:13.420081 543705 net.go:648] Add success.
I0320 03:30:13.422770 543705 net.go:770] primary dev: ETH0
I0320 03:30:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:30:13.422797 543705 net.go:698] Add success.
I0320 03:30:13.481043 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a514af7d-aebc-49db-9716-f864709e7a39","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:30:13.481075 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:30:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:30:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:30:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 03:30:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:30:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 03:30:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:30:15.455602 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:30:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:30:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:30:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:30:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:30:22.393677 543705 disk_info.go:125] begin check local disk info of client
I0320 03:30:22.396160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:30:22.396166 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032a480 0xc00032a4c0]
E0320 03:30:23.409906 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:23.409927 543705 memory.go:184] no items to output this cycle
I0320 03:30:23.409906 543705 cpu.go:275] no items to output this cycle
E0320 03:30:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:33.409782 543705 memory.go:184] no items to output this cycle
I0320 03:30:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 03:30:38.089734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:30:38.089741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:30:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:43.410642 543705 memory.go:191] Add success.
I0320 03:30:43.409828 543705 cpu.go:282] Add success.
I0320 03:30:43.420323 543705 net.go:648] Add success.
I0320 03:30:43.422771 543705 net.go:770] primary dev: ETH0
I0320 03:30:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:30:43.422796 543705 net.go:698] Add success.
I0320 03:30:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:30:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:30:53.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:30:53.409756 543705 memory.go:184] no items to output this cycle
I0320 03:30:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:31:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:03.409786 543705 memory.go:184] no items to output this cycle
I0320 03:31:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 03:31:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:13.409788 543705 memory.go:191] Add success.
I0320 03:31:13.409790 543705 cpu.go:282] Add success.
W0320 03:31:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:31:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:31:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:31:13.420056 543705 net.go:648] Add success.
I0320 03:31:13.422688 543705 net.go:770] primary dev: ETH0
I0320 03:31:13.422703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:31:13.422717 543705 net.go:698] Add success.
I0320 03:31:14.454988 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:31:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:31:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 03:31:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:31:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 03:31:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:31:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:31:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:31:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:31:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:31:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:31:22.397672 543705 disk_info.go:125] begin check local disk info of client
I0320 03:31:22.400075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:31:22.400082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002737c0 0xc000273800]
E0320 03:31:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:23.409798 543705 memory.go:184] no items to output this cycle
I0320 03:31:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 03:31:33.409886 543705 cpu.go:275] no items to output this cycle
E0320 03:31:33.410025 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:33.410037 543705 memory.go:184] no items to output this cycle
E0320 03:31:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:43.409791 543705 memory.go:191] Add success.
I0320 03:31:43.409800 543705 cpu.go:282] Add success.
I0320 03:31:43.419917 543705 net.go:648] Add success.
I0320 03:31:43.422938 543705 net.go:770] primary dev: ETH0
I0320 03:31:43.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:31:43.422964 543705 net.go:698] Add success.
I0320 03:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:31:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:31:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:31:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:31:53.409793 543705 memory.go:184] no items to output this cycle
I0320 03:31:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 03:32:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:03.409781 543705 memory.go:184] no items to output this cycle
I0320 03:32:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:32:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:13.409789 543705 memory.go:191] Add success.
I0320 03:32:13.409790 543705 cpu.go:282] Add success.
W0320 03:32:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:32:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:32:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:32:13.420332 543705 net.go:648] Add success.
I0320 03:32:13.423164 543705 net.go:770] primary dev: ETH0
I0320 03:32:13.423177 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:32:13.423189 543705 net.go:698] Add success.
W0320 03:32:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:32:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 03:32:14.455201 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:32:14.455865 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:32:14.455873 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:32:14.455880 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:32:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 03:32:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:32:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:32:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:32:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:32:16.457936 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:32:16.457980 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:32:16.457996 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:32:16.472306 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:32:22.402120 543705 disk_info.go:125] begin check local disk info of client
I0320 03:32:22.404696 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:32:22.404703 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315440 0xc000315480]
E0320 03:32:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:23.409759 543705 memory.go:184] no items to output this cycle
I0320 03:32:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:32:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:33.409789 543705 memory.go:184] no items to output this cycle
I0320 03:32:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:32:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:43.409791 543705 memory.go:191] Add success.
I0320 03:32:43.409793 543705 cpu.go:282] Add success.
I0320 03:32:43.419967 543705 net.go:648] Add success.
I0320 03:32:43.422512 543705 net.go:770] primary dev: ETH0
I0320 03:32:43.422525 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:32:43.422537 543705 net.go:698] Add success.
I0320 03:32:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:32:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:32:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:32:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:32:53.409764 543705 memory.go:184] no items to output this cycle
I0320 03:32:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 03:33:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:03.409786 543705 memory.go:184] no items to output this cycle
I0320 03:33:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 03:33:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:13.409784 543705 memory.go:191] Add success.
I0320 03:33:13.409806 543705 cpu.go:282] Add success.
W0320 03:33:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:33:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:33:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:33:13.420210 543705 net.go:648] Add success.
I0320 03:33:13.422732 543705 net.go:770] primary dev: ETH0
I0320 03:33:13.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:33:13.422756 543705 net.go:698] Add success.
I0320 03:33:13.939245 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a7bce0dc-1a7f-42e9-918f-8ee007a2507c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:33:13.939279 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:33:14.454889 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:33:14.454901 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:33:14.454978 543705 disk_worker.go:708] disk space is not compliant
W0320 03:33:14.454982 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:33:14.457455 543705 disk_worker.go:494] system disk:vda1
I0320 03:33:14.457498 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:33:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:33:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:33:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:33:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:33:16.472560 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:33:22.405674 543705 disk_info.go:125] begin check local disk info of client
I0320 03:33:22.408219 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:33:22.408227 543705 disk_info.go:196] parse disk info done, disk is : [0xc000265f40 0xc0003fe000]
E0320 03:33:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:23.409785 543705 memory.go:184] no items to output this cycle
I0320 03:33:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:33:33.409906 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:33.409914 543705 cpu.go:275] no items to output this cycle
I0320 03:33:33.409932 543705 memory.go:184] no items to output this cycle
I0320 03:33:38.089875 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:33:38.089882 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:33:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:43.410662 543705 memory.go:191] Add success.
I0320 03:33:43.409798 543705 cpu.go:282] Add success.
I0320 03:33:43.420353 543705 net.go:648] Add success.
I0320 03:33:43.422941 543705 net.go:770] primary dev: ETH0
I0320 03:33:43.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:33:43.422966 543705 net.go:698] Add success.
I0320 03:33:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:33:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:33:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:33:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:33:53.409783 543705 memory.go:184] no items to output this cycle
I0320 03:33:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 03:34:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:03.409809 543705 memory.go:184] no items to output this cycle
I0320 03:34:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 03:34:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:13.409812 543705 memory.go:191] Add success.
I0320 03:34:13.409819 543705 cpu.go:282] Add success.
W0320 03:34:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:34:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:34:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:34:13.420107 543705 net.go:648] Add success.
I0320 03:34:13.422991 543705 net.go:770] primary dev: ETH0
I0320 03:34:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:34:13.423029 543705 net.go:698] Add success.
I0320 03:34:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:34:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:34:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 03:34:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:34:14.456482 543705 disk_worker.go:494] system disk:vda1
I0320 03:34:14.456525 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:34:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:34:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:34:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:34:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:34:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:34:22.409676 543705 disk_info.go:125] begin check local disk info of client
I0320 03:34:22.412184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:34:22.412191 543705 disk_info.go:196] parse disk info done, disk is : [0xc000289e00 0xc000289e40]
E0320 03:34:23.409734 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:23.409835 543705 memory.go:184] no items to output this cycle
I0320 03:34:23.409924 543705 cpu.go:275] no items to output this cycle
E0320 03:34:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:33.409768 543705 memory.go:184] no items to output this cycle
I0320 03:34:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 03:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:43.409792 543705 memory.go:191] Add success.
I0320 03:34:43.409801 543705 cpu.go:282] Add success.
I0320 03:34:43.420001 543705 net.go:648] Add success.
I0320 03:34:43.422676 543705 net.go:770] primary dev: ETH0
I0320 03:34:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:34:43.422702 543705 net.go:698] Add success.
I0320 03:34:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:34:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:34:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:34:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:34:53.409781 543705 memory.go:184] no items to output this cycle
I0320 03:34:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 03:35:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:03.409805 543705 memory.go:184] no items to output this cycle
I0320 03:35:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 03:35:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:13.409812 543705 memory.go:191] Add success.
I0320 03:35:13.409823 543705 cpu.go:282] Add success.
W0320 03:35:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:35:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:35:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:35:13.420065 543705 net.go:648] Add success.
I0320 03:35:13.422713 543705 net.go:770] primary dev: ETH0
I0320 03:35:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:35:13.422738 543705 net.go:698] Add success.
I0320 03:35:14.453946 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:35:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:35:14.455341 543705 disk_worker.go:708] disk space is not compliant
W0320 03:35:14.455346 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:35:14.457229 543705 disk_worker.go:494] system disk:vda1
I0320 03:35:14.457266 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:35:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:35:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:35:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:35:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:35:16.472447 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:35:22.412794 543705 disk_info.go:125] begin check local disk info of client
I0320 03:35:22.415349 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:35:22.415356 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474180 0xc0004741c0]
E0320 03:35:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:23.409875 543705 memory.go:184] no items to output this cycle
I0320 03:35:23.409907 543705 cpu.go:275] no items to output this cycle
E0320 03:35:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:33.409799 543705 memory.go:184] no items to output this cycle
I0320 03:35:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 03:35:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:43.409795 543705 memory.go:191] Add success.
I0320 03:35:43.409808 543705 cpu.go:282] Add success.
I0320 03:35:43.420005 543705 net.go:648] Add success.
I0320 03:35:43.422574 543705 net.go:770] primary dev: ETH0
I0320 03:35:43.422589 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:35:43.422603 543705 net.go:698] Add success.
I0320 03:35:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:35:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:35:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:35:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:35:53.409779 543705 memory.go:184] no items to output this cycle
I0320 03:35:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 03:36:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:03.409782 543705 memory.go:184] no items to output this cycle
I0320 03:36:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 03:36:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:13.409802 543705 memory.go:191] Add success.
I0320 03:36:13.409804 543705 cpu.go:282] Add success.
W0320 03:36:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:36:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:36:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:36:13.420068 543705 net.go:648] Add success.
I0320 03:36:13.422714 543705 net.go:770] primary dev: ETH0
I0320 03:36:13.422729 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:36:13.422744 543705 net.go:698] Add success.
I0320 03:36:13.505534 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3691879c-60ca-4b7b-8321-76e542d85a86","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:36:13.505566 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:36:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:36:14.455221 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:36:14.455232 543705 disk_worker.go:708] disk space is not compliant
W0320 03:36:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:36:14.456721 543705 disk_worker.go:494] system disk:vda1
I0320 03:36:14.456750 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:36:15.456020 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:36:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:36:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:36:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:36:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:36:22.415800 543705 disk_info.go:125] begin check local disk info of client
I0320 03:36:22.418374 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:36:22.418381 543705 disk_info.go:196] parse disk info done, disk is : [0xc000564080 0xc0005640c0]
E0320 03:36:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:23.409764 543705 memory.go:184] no items to output this cycle
I0320 03:36:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:36:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:33.409780 543705 memory.go:184] no items to output this cycle
I0320 03:36:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 03:36:38.093158 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:36:38.093164 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:36:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:43.410673 543705 memory.go:191] Add success.
I0320 03:36:43.409809 543705 cpu.go:282] Add success.
I0320 03:36:43.420455 543705 net.go:648] Add success.
I0320 03:36:43.423164 543705 net.go:770] primary dev: ETH0
I0320 03:36:43.423181 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:36:43.423195 543705 net.go:698] Add success.
I0320 03:36:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:36:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:36:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:36:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:36:53.409799 543705 memory.go:184] no items to output this cycle
I0320 03:36:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:37:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:03.409791 543705 memory.go:184] no items to output this cycle
I0320 03:37:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 03:37:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:13.409774 543705 memory.go:191] Add success.
W0320 03:37:13.409799 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:37:13.409798 543705 cpu.go:282] Add success.
W0320 03:37:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:37:13.409813 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:37:13.420154 543705 net.go:648] Add success.
I0320 03:37:13.423022 543705 net.go:770] primary dev: ETH0
I0320 03:37:13.423036 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:37:13.423051 543705 net.go:698] Add success.
I0320 03:37:13.453593 543705 event_worker.go:152] Polling the log file for events...
W0320 03:37:14.454429 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:37:14.454530 543705 disk_worker.go:708] disk space is not compliant
W0320 03:37:14.454535 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:37:14.454936 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:37:14.454945 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:37:14.454951 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:37:14.457081 543705 disk_worker.go:494] system disk:vda1
I0320 03:37:14.457125 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:37:15.457032 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:37:15.457198 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:37:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:37:16.457970 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:37:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:37:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:37:16.472472 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:37:22.418787 543705 disk_info.go:125] begin check local disk info of client
I0320 03:37:22.421339 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:37:22.421346 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037c280 0xc00037c2c0]
E0320 03:37:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:23.409791 543705 memory.go:184] no items to output this cycle
I0320 03:37:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 03:37:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 03:37:33.409793 543705 memory.go:184] no items to output this cycle
E0320 03:37:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:43.409805 543705 memory.go:191] Add success.
I0320 03:37:43.409815 543705 cpu.go:282] Add success.
I0320 03:37:43.419854 543705 net.go:648] Add success.
I0320 03:37:43.422504 543705 net.go:770] primary dev: ETH0
I0320 03:37:43.422518 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:37:43.422531 543705 net.go:698] Add success.
I0320 03:37:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:37:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:37:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:37:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:37:53.409782 543705 memory.go:184] no items to output this cycle
I0320 03:37:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:38:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:03.409784 543705 memory.go:184] no items to output this cycle
I0320 03:38:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 03:38:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:13.409775 543705 memory.go:191] Add success.
W0320 03:38:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:38:13.409803 543705 cpu.go:282] Add success.
W0320 03:38:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:38:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:38:13.420105 543705 net.go:648] Add success.
I0320 03:38:13.422749 543705 net.go:770] primary dev: ETH0
I0320 03:38:13.422764 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:38:13.422776 543705 net.go:698] Add success.
I0320 03:38:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:38:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:38:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 03:38:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:38:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 03:38:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:38:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:38:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:38:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:38:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:38:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:38:22.421799 543705 disk_info.go:125] begin check local disk info of client
I0320 03:38:22.424267 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:38:22.424276 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ce000 0xc0003ce040]
E0320 03:38:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:23.409783 543705 memory.go:184] no items to output this cycle
I0320 03:38:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:38:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:33.409787 543705 memory.go:184] no items to output this cycle
I0320 03:38:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:38:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:43.409818 543705 memory.go:191] Add success.
I0320 03:38:43.409818 543705 cpu.go:282] Add success.
I0320 03:38:43.419895 543705 net.go:648] Add success.
I0320 03:38:43.422438 543705 net.go:770] primary dev: ETH0
I0320 03:38:43.422450 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:38:43.422462 543705 net.go:698] Add success.
I0320 03:38:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:38:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:38:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:38:53.409777 543705 cpu.go:275] no items to output this cycle
E0320 03:38:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:38:53.409795 543705 memory.go:184] no items to output this cycle
E0320 03:39:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:03.409778 543705 memory.go:184] no items to output this cycle
I0320 03:39:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 03:39:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:13.409787 543705 memory.go:191] Add success.
I0320 03:39:13.409805 543705 cpu.go:282] Add success.
W0320 03:39:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:39:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:39:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:39:13.420190 543705 net.go:648] Add success.
I0320 03:39:13.423368 543705 net.go:770] primary dev: ETH0
I0320 03:39:13.423384 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:39:13.423397 543705 net.go:698] Add success.
I0320 03:39:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:39:14.455280 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:39:14.455294 543705 disk_worker.go:708] disk space is not compliant
W0320 03:39:14.455299 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:39:14.457530 543705 disk_worker.go:494] system disk:vda1
I0320 03:39:14.457577 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:39:14.744667 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3ccfd4ee-99fc-4f0d-acbf-df045979eee2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:39:14.744714 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:39:15.455703 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:39:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:39:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:39:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:39:16.472448 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:39:22.424816 543705 disk_info.go:125] begin check local disk info of client
I0320 03:39:22.427369 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:39:22.427376 543705 disk_info.go:196] parse disk info done, disk is : [0xc000273600 0xc000273640]
E0320 03:39:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:23.409790 543705 memory.go:184] no items to output this cycle
I0320 03:39:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:39:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:33.409782 543705 memory.go:184] no items to output this cycle
I0320 03:39:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 03:39:38.093732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:39:38.093739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:39:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:43.410648 543705 memory.go:191] Add success.
I0320 03:39:43.409824 543705 cpu.go:282] Add success.
I0320 03:39:43.420393 543705 net.go:648] Add success.
I0320 03:39:43.422834 543705 net.go:770] primary dev: ETH0
I0320 03:39:43.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:39:43.422859 543705 net.go:698] Add success.
I0320 03:39:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:39:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:39:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:39:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:39:53.409783 543705 memory.go:184] no items to output this cycle
I0320 03:39:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 03:40:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:03.409781 543705 memory.go:184] no items to output this cycle
I0320 03:40:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 03:40:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:13.409776 543705 memory.go:191] Add success.
W0320 03:40:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:40:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:40:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:40:13.409821 543705 cpu.go:282] Add success.
I0320 03:40:13.420049 543705 net.go:648] Add success.
I0320 03:40:13.422693 543705 net.go:770] primary dev: ETH0
I0320 03:40:13.422706 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:40:13.422718 543705 net.go:698] Add success.
I0320 03:40:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:40:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:40:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 03:40:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:40:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 03:40:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:40:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:40:16.457961 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:40:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:40:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:40:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:40:22.427832 543705 disk_info.go:125] begin check local disk info of client
I0320 03:40:22.430307 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:40:22.430314 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a0a80 0xc0002a0ac0]
E0320 03:40:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:23.409799 543705 memory.go:184] no items to output this cycle
I0320 03:40:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:40:33.409911 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:33.409945 543705 cpu.go:275] no items to output this cycle
I0320 03:40:33.409948 543705 memory.go:184] no items to output this cycle
E0320 03:40:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:43.409810 543705 memory.go:191] Add success.
I0320 03:40:43.409822 543705 cpu.go:282] Add success.
I0320 03:40:43.419962 543705 net.go:648] Add success.
I0320 03:40:43.422752 543705 net.go:770] primary dev: ETH0
I0320 03:40:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:40:43.422776 543705 net.go:698] Add success.
I0320 03:40:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:40:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:40:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:40:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:40:53.409776 543705 memory.go:184] no items to output this cycle
I0320 03:40:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 03:41:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:03.409787 543705 memory.go:184] no items to output this cycle
I0320 03:41:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 03:41:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:13.409785 543705 memory.go:191] Add success.
I0320 03:41:13.409806 543705 cpu.go:282] Add success.
W0320 03:41:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:41:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:41:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:41:13.420036 543705 net.go:648] Add success.
I0320 03:41:13.422543 543705 net.go:770] primary dev: ETH0
I0320 03:41:13.422557 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:41:13.422568 543705 net.go:698] Add success.
I0320 03:41:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:41:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:41:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 03:41:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:41:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 03:41:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:41:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:41:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:41:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:41:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:41:16.472455 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:41:22.430841 543705 disk_info.go:125] begin check local disk info of client
I0320 03:41:22.433432 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:41:22.433439 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266340 0xc000266380]
E0320 03:41:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:23.409805 543705 memory.go:184] no items to output this cycle
I0320 03:41:23.409812 543705 cpu.go:275] no items to output this cycle
E0320 03:41:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:33.409779 543705 memory.go:184] no items to output this cycle
I0320 03:41:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:41:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:43.409789 543705 memory.go:191] Add success.
I0320 03:41:43.409817 543705 cpu.go:282] Add success.
I0320 03:41:43.419968 543705 net.go:648] Add success.
I0320 03:41:43.422485 543705 net.go:770] primary dev: ETH0
I0320 03:41:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:41:43.422510 543705 net.go:698] Add success.
I0320 03:41:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:41:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:41:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:41:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:41:53.409770 543705 memory.go:184] no items to output this cycle
I0320 03:41:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:42:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:03.409776 543705 memory.go:184] no items to output this cycle
I0320 03:42:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:42:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:13.409777 543705 memory.go:191] Add success.
W0320 03:42:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:42:13.409807 543705 cpu.go:282] Add success.
W0320 03:42:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:42:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:42:13.420247 543705 net.go:648] Add success.
I0320 03:42:13.422885 543705 net.go:770] primary dev: ETH0
I0320 03:42:13.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:42:13.422911 543705 net.go:698] Add success.
I0320 03:42:13.659637 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a2349ca2-3528-4b39-856a-e1a93ce7434a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:42:13.659671 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 03:42:14.454164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:42:14.454227 543705 disk_worker.go:708] disk space is not compliant
W0320 03:42:14.454230 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:42:14.456076 543705 disk_worker.go:494] system disk:vda1
E0320 03:42:14.456101 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
I0320 03:42:14.456106 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:42:14.456110 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:42:14.456115 543705 custom_config.go:64] query custom config with name: gpu
E0320 03:42:15.456875 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:42:15.456884 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:42:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:42:16.457935 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:42:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:42:16.457996 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:42:16.472322 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:42:22.433869 543705 disk_info.go:125] begin check local disk info of client
I0320 03:42:22.436285 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:42:22.436292 543705 disk_info.go:196] parse disk info done, disk is : [0xc000468b40 0xc000468b80]
E0320 03:42:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:23.409792 543705 memory.go:184] no items to output this cycle
I0320 03:42:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 03:42:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:33.409763 543705 memory.go:184] no items to output this cycle
I0320 03:42:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 03:42:38.093883 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:42:38.093888 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:42:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:43.410820 543705 memory.go:191] Add success.
I0320 03:42:43.409825 543705 cpu.go:282] Add success.
I0320 03:42:43.420567 543705 net.go:648] Add success.
I0320 03:42:43.423282 543705 net.go:770] primary dev: ETH0
I0320 03:42:43.423301 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:42:43.423315 543705 net.go:698] Add success.
I0320 03:42:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:42:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:42:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:42:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:42:53.409763 543705 memory.go:184] no items to output this cycle
I0320 03:42:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 03:43:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:03.409807 543705 memory.go:184] no items to output this cycle
I0320 03:43:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 03:43:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:13.409813 543705 memory.go:191] Add success.
I0320 03:43:13.409824 543705 cpu.go:282] Add success.
W0320 03:43:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:43:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:43:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:43:13.420158 543705 net.go:648] Add success.
I0320 03:43:13.422637 543705 net.go:770] primary dev: ETH0
I0320 03:43:13.422650 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:43:13.422662 543705 net.go:698] Add success.
I0320 03:43:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:43:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:43:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 03:43:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:43:14.456511 543705 disk_worker.go:494] system disk:vda1
I0320 03:43:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:43:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:43:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:43:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:43:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:43:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:43:22.436879 543705 disk_info.go:125] begin check local disk info of client
I0320 03:43:22.439370 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:43:22.439376 543705 disk_info.go:196] parse disk info done, disk is : [0xc000299340 0xc000299380]
E0320 03:43:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:23.409782 543705 memory.go:184] no items to output this cycle
I0320 03:43:23.409797 543705 cpu.go:275] no items to output this cycle
E0320 03:43:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 03:43:33.409791 543705 memory.go:184] no items to output this cycle
E0320 03:43:43.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:43.409926 543705 memory.go:191] Add success.
I0320 03:43:43.409980 543705 cpu.go:282] Add success.
I0320 03:43:43.419731 543705 net.go:648] Add success.
I0320 03:43:43.422418 543705 net.go:770] primary dev: ETH0
I0320 03:43:43.422435 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:43:43.422447 543705 net.go:698] Add success.
I0320 03:43:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:43:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:43:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:43:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:43:53.409776 543705 memory.go:184] no items to output this cycle
I0320 03:43:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 03:44:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:03.409784 543705 memory.go:184] no items to output this cycle
I0320 03:44:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 03:44:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:13.409793 543705 memory.go:191] Add success.
I0320 03:44:13.409793 543705 cpu.go:282] Add success.
W0320 03:44:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:44:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:44:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:44:13.420142 543705 net.go:648] Add success.
I0320 03:44:13.423231 543705 net.go:770] primary dev: ETH0
I0320 03:44:13.423249 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:44:13.423263 543705 net.go:698] Add success.
I0320 03:44:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:44:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:44:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 03:44:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:44:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 03:44:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:44:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:44:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:44:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:44:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:44:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:44:22.439895 543705 disk_info.go:125] begin check local disk info of client
I0320 03:44:22.442340 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:44:22.442347 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bf500 0xc0002bf540]
E0320 03:44:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:23.409781 543705 memory.go:184] no items to output this cycle
I0320 03:44:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:44:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:33.409783 543705 memory.go:184] no items to output this cycle
I0320 03:44:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 03:44:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:43.409785 543705 memory.go:191] Add success.
I0320 03:44:43.409816 543705 cpu.go:282] Add success.
I0320 03:44:43.419977 543705 net.go:648] Add success.
I0320 03:44:43.422694 543705 net.go:770] primary dev: ETH0
I0320 03:44:43.422706 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:44:43.422718 543705 net.go:698] Add success.
I0320 03:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:44:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:44:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:44:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:44:53.409764 543705 memory.go:184] no items to output this cycle
I0320 03:44:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:45:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:03.409789 543705 memory.go:184] no items to output this cycle
I0320 03:45:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 03:45:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:13.409778 543705 memory.go:191] Add success.
W0320 03:45:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:45:13.409809 543705 cpu.go:282] Add success.
W0320 03:45:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:45:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:45:13.420044 543705 net.go:648] Add success.
I0320 03:45:13.423028 543705 net.go:770] primary dev: ETH0
I0320 03:45:13.423040 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:45:13.423052 543705 net.go:698] Add success.
I0320 03:45:13.553667 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a2ad8fb9-fc15-418a-9263-82a3f7000216","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:45:13.553703 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:45:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:45:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:45:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 03:45:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:45:14.456674 543705 disk_worker.go:494] system disk:vda1
I0320 03:45:14.456705 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:45:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:45:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:45:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:45:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:45:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:45:22.442906 543705 disk_info.go:125] begin check local disk info of client
I0320 03:45:22.445413 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:45:22.445419 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376e80 0xc000376ec0]
E0320 03:45:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:23.409787 543705 memory.go:184] no items to output this cycle
I0320 03:45:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:45:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:33.409768 543705 memory.go:184] no items to output this cycle
I0320 03:45:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 03:45:38.097174 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:45:38.097180 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:45:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:43.410632 543705 memory.go:191] Add success.
I0320 03:45:43.409810 543705 cpu.go:282] Add success.
I0320 03:45:43.420347 543705 net.go:648] Add success.
I0320 03:45:43.422935 543705 net.go:770] primary dev: ETH0
I0320 03:45:43.422948 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:45:43.422961 543705 net.go:698] Add success.
I0320 03:45:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:45:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:45:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:45:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:45:53.409772 543705 memory.go:184] no items to output this cycle
I0320 03:45:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:46:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:03.409804 543705 memory.go:184] no items to output this cycle
I0320 03:46:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 03:46:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:13.409785 543705 memory.go:191] Add success.
I0320 03:46:13.409786 543705 cpu.go:282] Add success.
W0320 03:46:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:46:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:46:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:46:13.420075 543705 net.go:648] Add success.
I0320 03:46:13.422871 543705 net.go:770] primary dev: ETH0
I0320 03:46:13.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:46:13.422896 543705 net.go:698] Add success.
I0320 03:46:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:46:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:46:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 03:46:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:46:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 03:46:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:46:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:46:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:46:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:46:16.472406 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:46:22.445924 543705 disk_info.go:125] begin check local disk info of client
I0320 03:46:22.448485 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:46:22.448492 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab280 0xc0001ab2c0]
E0320 03:46:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 03:46:23.409781 543705 memory.go:184] no items to output this cycle
E0320 03:46:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:33.409784 543705 memory.go:184] no items to output this cycle
I0320 03:46:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:46:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:43.409781 543705 memory.go:191] Add success.
I0320 03:46:43.409802 543705 cpu.go:282] Add success.
I0320 03:46:43.420033 543705 net.go:648] Add success.
I0320 03:46:43.422743 543705 net.go:770] primary dev: ETH0
I0320 03:46:43.422758 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:46:43.422773 543705 net.go:698] Add success.
I0320 03:46:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:46:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:46:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:46:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:46:53.409767 543705 memory.go:184] no items to output this cycle
I0320 03:46:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:47:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:03.409781 543705 memory.go:184] no items to output this cycle
I0320 03:47:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:47:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:13.409810 543705 memory.go:191] Add success.
I0320 03:47:13.409814 543705 cpu.go:282] Add success.
W0320 03:47:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:47:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:47:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:47:13.420244 543705 net.go:648] Add success.
I0320 03:47:13.422850 543705 net.go:770] primary dev: ETH0
I0320 03:47:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:47:13.422877 543705 net.go:698] Add success.
I0320 03:47:13.453411 543705 event_worker.go:152] Polling the log file for events...
W0320 03:47:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:47:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 03:47:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:47:14.456783 543705 disk_worker.go:494] system disk:vda1
I0320 03:47:14.456823 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:47:14.457147 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:47:14.457154 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:47:14.457159 543705 custom_config.go:64] query custom config with name: gpu
E0320 03:47:15.456976 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:47:15.456991 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:47:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:47:16.457998 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:47:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:47:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:47:16.472527 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:47:22.448939 543705 disk_info.go:125] begin check local disk info of client
I0320 03:47:22.451465 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:47:22.451471 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000edd80 0xc0000eddc0]
E0320 03:47:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:23.409792 543705 memory.go:184] no items to output this cycle
I0320 03:47:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 03:47:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:33.409773 543705 memory.go:184] no items to output this cycle
I0320 03:47:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:47:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:43.409787 543705 memory.go:191] Add success.
I0320 03:47:43.409798 543705 cpu.go:282] Add success.
I0320 03:47:43.419837 543705 net.go:648] Add success.
I0320 03:47:43.422465 543705 net.go:770] primary dev: ETH0
I0320 03:47:43.422479 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:47:43.422492 543705 net.go:698] Add success.
I0320 03:47:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:47:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:47:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:47:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:47:53.409795 543705 memory.go:184] no items to output this cycle
I0320 03:47:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 03:48:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:03.409786 543705 memory.go:184] no items to output this cycle
I0320 03:48:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:48:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:13.409788 543705 memory.go:191] Add success.
I0320 03:48:13.409791 543705 cpu.go:282] Add success.
W0320 03:48:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:48:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:48:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:48:13.420205 543705 net.go:648] Add success.
I0320 03:48:13.423162 543705 net.go:770] primary dev: ETH0
I0320 03:48:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:48:13.423188 543705 net.go:698] Add success.
I0320 03:48:13.469235 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"db7c55b8-727e-4966-a8a7-36ecce4b6cb1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:48:13.469274 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:48:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:48:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:48:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 03:48:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:48:14.456540 543705 disk_worker.go:494] system disk:vda1
I0320 03:48:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:48:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:48:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:48:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:48:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:48:16.472429 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:48:22.451955 543705 disk_info.go:125] begin check local disk info of client
I0320 03:48:22.454462 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:48:22.454468 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4a40 0xc0000c4a80]
E0320 03:48:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:23.409783 543705 memory.go:184] no items to output this cycle
I0320 03:48:23.409787 543705 cpu.go:275] no items to output this cycle
E0320 03:48:33.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:33.409899 543705 memory.go:184] no items to output this cycle
I0320 03:48:33.409902 543705 cpu.go:275] no items to output this cycle
I0320 03:48:38.097746 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:48:38.097753 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:48:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:43.410605 543705 memory.go:191] Add success.
I0320 03:48:43.409823 543705 cpu.go:282] Add success.
I0320 03:48:43.420317 543705 net.go:648] Add success.
I0320 03:48:43.422816 543705 net.go:770] primary dev: ETH0
I0320 03:48:43.422829 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:48:43.422842 543705 net.go:698] Add success.
I0320 03:48:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:48:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:48:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:48:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:48:53.409794 543705 memory.go:184] no items to output this cycle
I0320 03:48:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 03:49:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:03.409783 543705 memory.go:184] no items to output this cycle
I0320 03:49:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:49:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:13.409786 543705 memory.go:191] Add success.
I0320 03:49:13.409788 543705 cpu.go:282] Add success.
W0320 03:49:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:49:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:49:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:49:13.420072 543705 net.go:648] Add success.
I0320 03:49:13.422795 543705 net.go:770] primary dev: ETH0
I0320 03:49:13.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:49:13.422823 543705 net.go:698] Add success.
I0320 03:49:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:49:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:49:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 03:49:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:49:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 03:49:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:49:15.455984 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:49:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:49:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:49:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:49:16.472483 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:49:22.454972 543705 disk_info.go:125] begin check local disk info of client
I0320 03:49:22.457583 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:49:22.457589 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003534c0 0xc000353500]
E0320 03:49:23.410300 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:23.410315 543705 memory.go:184] no items to output this cycle
I0320 03:49:23.410434 543705 cpu.go:275] no items to output this cycle
E0320 03:49:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:33.409777 543705 memory.go:184] no items to output this cycle
I0320 03:49:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:49:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:43.409787 543705 memory.go:191] Add success.
I0320 03:49:43.409802 543705 cpu.go:282] Add success.
I0320 03:49:43.420293 543705 net.go:648] Add success.
I0320 03:49:43.423552 543705 net.go:770] primary dev: ETH0
I0320 03:49:43.423566 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:49:43.423579 543705 net.go:698] Add success.
I0320 03:49:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:49:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:49:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:49:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:49:53.409794 543705 memory.go:184] no items to output this cycle
I0320 03:49:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 03:50:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:03.409772 543705 memory.go:184] no items to output this cycle
I0320 03:50:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 03:50:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:13.409789 543705 memory.go:191] Add success.
I0320 03:50:13.409793 543705 cpu.go:282] Add success.
W0320 03:50:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:50:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:50:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:50:13.420035 543705 net.go:648] Add success.
I0320 03:50:13.422834 543705 net.go:770] primary dev: ETH0
I0320 03:50:13.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:50:13.422859 543705 net.go:698] Add success.
I0320 03:50:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:50:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:50:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 03:50:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:50:14.456628 543705 disk_worker.go:494] system disk:vda1
I0320 03:50:14.456659 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:50:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:50:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:50:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:50:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:50:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:50:22.457974 543705 disk_info.go:125] begin check local disk info of client
I0320 03:50:22.460483 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:50:22.460488 543705 disk_info.go:196] parse disk info done, disk is : [0xc000375c40 0xc000375c80]
E0320 03:50:23.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:23.409890 543705 memory.go:184] no items to output this cycle
I0320 03:50:23.410039 543705 cpu.go:275] no items to output this cycle
E0320 03:50:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:33.409797 543705 memory.go:184] no items to output this cycle
I0320 03:50:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 03:50:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:43.409795 543705 memory.go:191] Add success.
I0320 03:50:43.409796 543705 cpu.go:282] Add success.
I0320 03:50:43.419980 543705 net.go:648] Add success.
I0320 03:50:43.422850 543705 net.go:770] primary dev: ETH0
I0320 03:50:43.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:50:43.422880 543705 net.go:698] Add success.
I0320 03:50:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:50:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:50:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:50:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:50:53.409777 543705 memory.go:184] no items to output this cycle
I0320 03:50:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 03:51:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:03.409810 543705 memory.go:184] no items to output this cycle
I0320 03:51:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 03:51:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:13.409783 543705 memory.go:191] Add success.
W0320 03:51:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 03:51:13.409816 543705 cpu.go:282] Add success.
W0320 03:51:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:51:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:51:13.420127 543705 net.go:648] Add success.
I0320 03:51:13.423087 543705 net.go:770] primary dev: ETH0
I0320 03:51:13.423102 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:51:13.423114 543705 net.go:698] Add success.
I0320 03:51:13.469244 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ea6daa3a-7c2d-43a8-a516-575afaf869b9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:51:13.469286 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:51:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:51:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:51:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 03:51:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:51:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 03:51:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:51:15.456028 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:51:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:51:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:51:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:51:16.472471 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:51:22.461001 543705 disk_info.go:125] begin check local disk info of client
I0320 03:51:22.463578 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:51:22.463584 543705 disk_info.go:196] parse disk info done, disk is : [0xc000374080 0xc0003740c0]
E0320 03:51:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:23.409789 543705 memory.go:184] no items to output this cycle
I0320 03:51:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:51:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:33.409795 543705 memory.go:184] no items to output this cycle
I0320 03:51:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 03:51:38.101193 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:51:38.101200 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:51:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:43.410784 543705 memory.go:191] Add success.
I0320 03:51:43.409806 543705 cpu.go:282] Add success.
I0320 03:51:43.420561 543705 net.go:648] Add success.
I0320 03:51:43.423206 543705 net.go:770] primary dev: ETH0
I0320 03:51:43.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:51:43.423231 543705 net.go:698] Add success.
I0320 03:51:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:51:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:51:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:51:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:51:53.409778 543705 memory.go:184] no items to output this cycle
I0320 03:51:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 03:52:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:03.409784 543705 memory.go:184] no items to output this cycle
I0320 03:52:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 03:52:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:13.409819 543705 memory.go:191] Add success.
I0320 03:52:13.409824 543705 cpu.go:282] Add success.
W0320 03:52:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:52:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:52:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:52:13.420149 543705 net.go:648] Add success.
I0320 03:52:13.422813 543705 net.go:770] primary dev: ETH0
I0320 03:52:13.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:52:13.422850 543705 net.go:698] Add success.
W0320 03:52:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:52:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 03:52:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:52:14.455876 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:52:14.455885 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:52:14.455891 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:52:14.456645 543705 disk_worker.go:494] system disk:vda1
I0320 03:52:14.456693 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:52:15.456779 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:52:15.456788 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:52:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:52:16.457928 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:52:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:52:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:52:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:52:22.464007 543705 disk_info.go:125] begin check local disk info of client
I0320 03:52:22.466579 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:52:22.466587 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c43c0 0xc0000c4400]
E0320 03:52:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:23.409765 543705 memory.go:184] no items to output this cycle
I0320 03:52:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 03:52:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:33.409779 543705 memory.go:184] no items to output this cycle
I0320 03:52:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 03:52:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:43.409790 543705 memory.go:191] Add success.
I0320 03:52:43.409792 543705 cpu.go:282] Add success.
I0320 03:52:43.419928 543705 net.go:648] Add success.
I0320 03:52:43.422438 543705 net.go:770] primary dev: ETH0
I0320 03:52:43.422451 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:52:43.422463 543705 net.go:698] Add success.
I0320 03:52:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:52:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:52:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:52:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:52:53.409800 543705 memory.go:184] no items to output this cycle
I0320 03:52:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 03:53:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:03.409804 543705 memory.go:184] no items to output this cycle
I0320 03:53:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 03:53:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:13.409790 543705 memory.go:191] Add success.
I0320 03:53:13.409812 543705 cpu.go:282] Add success.
W0320 03:53:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:53:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:53:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:53:13.420103 543705 net.go:648] Add success.
I0320 03:53:13.423036 543705 net.go:770] primary dev: ETH0
I0320 03:53:13.423049 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:53:13.423061 543705 net.go:698] Add success.
I0320 03:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:53:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:53:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 03:53:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:53:14.456609 543705 disk_worker.go:494] system disk:vda1
I0320 03:53:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:53:15.455987 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:53:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:53:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:53:16.458093 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:53:16.472491 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:53:22.467033 543705 disk_info.go:125] begin check local disk info of client
I0320 03:53:22.469614 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:53:22.469621 543705 disk_info.go:196] parse disk info done, disk is : [0xc000260000 0xc000260040]
E0320 03:53:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:23.409781 543705 memory.go:184] no items to output this cycle
I0320 03:53:23.409781 543705 cpu.go:275] no items to output this cycle
E0320 03:53:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:33.409807 543705 memory.go:184] no items to output this cycle
I0320 03:53:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 03:53:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:43.409813 543705 memory.go:191] Add success.
I0320 03:53:43.409829 543705 cpu.go:282] Add success.
I0320 03:53:43.419957 543705 net.go:648] Add success.
I0320 03:53:43.422874 543705 net.go:770] primary dev: ETH0
I0320 03:53:43.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:53:43.422898 543705 net.go:698] Add success.
I0320 03:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:53:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:53:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:53:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:53:53.409774 543705 memory.go:184] no items to output this cycle
I0320 03:53:53.409777 543705 cpu.go:275] no items to output this cycle
E0320 03:54:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:03.409805 543705 memory.go:184] no items to output this cycle
I0320 03:54:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 03:54:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:13.409795 543705 memory.go:191] Add success.
I0320 03:54:13.409798 543705 cpu.go:282] Add success.
W0320 03:54:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:54:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:54:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:54:13.420061 543705 net.go:648] Add success.
I0320 03:54:13.422993 543705 net.go:770] primary dev: ETH0
I0320 03:54:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:54:13.423024 543705 net.go:698] Add success.
I0320 03:54:13.464394 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f0a17ad1-1525-49c8-af71-92d014daa3ca","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:54:13.464427 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 03:54:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:54:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:54:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 03:54:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:54:14.456543 543705 disk_worker.go:494] system disk:vda1
I0320 03:54:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:54:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:54:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:54:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:54:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:54:22.470036 543705 disk_info.go:125] begin check local disk info of client
I0320 03:54:22.472482 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:54:22.472488 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364300 0xc000364340]
E0320 03:54:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:23.409787 543705 memory.go:184] no items to output this cycle
I0320 03:54:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:54:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:33.409807 543705 memory.go:184] no items to output this cycle
I0320 03:54:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 03:54:38.101733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:54:38.101740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:54:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:43.410685 543705 memory.go:191] Add success.
I0320 03:54:43.409801 543705 cpu.go:282] Add success.
I0320 03:54:43.420432 543705 net.go:648] Add success.
I0320 03:54:43.422929 543705 net.go:770] primary dev: ETH0
I0320 03:54:43.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:54:43.422956 543705 net.go:698] Add success.
I0320 03:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:54:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:54:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:54:53.410403 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:54:53.410425 543705 memory.go:184] no items to output this cycle
I0320 03:54:53.410431 543705 cpu.go:275] no items to output this cycle
E0320 03:55:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:03.409806 543705 memory.go:184] no items to output this cycle
I0320 03:55:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 03:55:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:13.409791 543705 memory.go:191] Add success.
I0320 03:55:13.409796 543705 cpu.go:282] Add success.
W0320 03:55:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:55:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:55:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:55:13.420044 543705 net.go:648] Add success.
I0320 03:55:13.422727 543705 net.go:770] primary dev: ETH0
I0320 03:55:13.422739 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:55:13.422752 543705 net.go:698] Add success.
I0320 03:55:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:55:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:55:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 03:55:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:55:14.456623 543705 disk_worker.go:494] system disk:vda1
I0320 03:55:14.456654 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:55:15.455992 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:55:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:55:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:55:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:55:16.472460 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:55:22.473063 543705 disk_info.go:125] begin check local disk info of client
I0320 03:55:22.475612 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:55:22.475618 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035a340 0xc00035a380]
E0320 03:55:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:23.409791 543705 memory.go:184] no items to output this cycle
I0320 03:55:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 03:55:33.409910 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:33.409912 543705 cpu.go:275] no items to output this cycle
I0320 03:55:33.409928 543705 memory.go:184] no items to output this cycle
E0320 03:55:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:43.409780 543705 memory.go:191] Add success.
I0320 03:55:43.409813 543705 cpu.go:282] Add success.
I0320 03:55:43.419902 543705 net.go:648] Add success.
I0320 03:55:43.422764 543705 net.go:770] primary dev: ETH0
I0320 03:55:43.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:55:43.422789 543705 net.go:698] Add success.
I0320 03:55:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:55:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:55:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:55:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:55:53.409777 543705 cpu.go:275] no items to output this cycle
I0320 03:55:53.409788 543705 memory.go:184] no items to output this cycle
E0320 03:56:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:03.409795 543705 memory.go:184] no items to output this cycle
I0320 03:56:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:56:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:13.409795 543705 memory.go:191] Add success.
I0320 03:56:13.409800 543705 cpu.go:282] Add success.
W0320 03:56:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:56:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:56:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:56:13.420116 543705 net.go:648] Add success.
I0320 03:56:13.422834 543705 net.go:770] primary dev: ETH0
I0320 03:56:13.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:56:13.422863 543705 net.go:698] Add success.
I0320 03:56:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:56:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:56:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 03:56:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:56:14.456613 543705 disk_worker.go:494] system disk:vda1
I0320 03:56:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:56:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:56:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:56:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:56:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:56:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:56:22.476076 543705 disk_info.go:125] begin check local disk info of client
I0320 03:56:22.478635 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:56:22.478641 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4a00 0xc0000c4a40]
E0320 03:56:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:23.409801 543705 memory.go:184] no items to output this cycle
I0320 03:56:23.409812 543705 cpu.go:275] no items to output this cycle
E0320 03:56:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:33.409765 543705 memory.go:184] no items to output this cycle
I0320 03:56:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 03:56:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:43.409817 543705 memory.go:191] Add success.
I0320 03:56:43.409824 543705 cpu.go:282] Add success.
I0320 03:56:43.420003 543705 net.go:648] Add success.
I0320 03:56:43.423049 543705 net.go:770] primary dev: ETH0
I0320 03:56:43.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:56:43.423073 543705 net.go:698] Add success.
I0320 03:56:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:56:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:56:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:56:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:56:53.409771 543705 memory.go:184] no items to output this cycle
I0320 03:56:53.409776 543705 cpu.go:275] no items to output this cycle
E0320 03:57:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:03.409782 543705 memory.go:184] no items to output this cycle
I0320 03:57:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 03:57:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:13.409795 543705 cpu.go:282] Add success.
I0320 03:57:13.409800 543705 memory.go:191] Add success.
W0320 03:57:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:57:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:57:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:57:13.420194 543705 net.go:648] Add success.
I0320 03:57:13.429714 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 03:57:13.429789 543705 net.go:770] primary dev: ETH0
I0320 03:57:13.429802 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:57:13.429813 543705 net.go:698] Add success.
I0320 03:57:13.453463 543705 event_worker.go:152] Polling the log file for events...
I0320 03:57:13.470047 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"07003181-9047-4c9d-af5e-745ef109bae2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 03:57:13.470079 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 03:57:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:57:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 03:57:14.455185 543705 disk_worker.go:728] disk inode is not compliant
E0320 03:57:14.456135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 03:57:14.456144 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 03:57:14.456150 543705 custom_config.go:64] query custom config with name: gpu
I0320 03:57:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 03:57:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 03:57:15.456553 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 03:57:15.456568 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:57:16.458095 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 03:57:16.458123 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 03:57:16.458157 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:57:16.458178 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:57:16.472537 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:57:22.479085 543705 disk_info.go:125] begin check local disk info of client
I0320 03:57:22.481668 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:57:22.481675 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024d6c0 0xc00024d700]
E0320 03:57:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 03:57:23.409779 543705 memory.go:184] no items to output this cycle
E0320 03:57:33.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:33.409896 543705 memory.go:184] no items to output this cycle
I0320 03:57:33.409973 543705 cpu.go:275] no items to output this cycle
I0320 03:57:38.101886 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 03:57:38.101891 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 03:57:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:43.410579 543705 memory.go:191] Add success.
I0320 03:57:43.409801 543705 cpu.go:282] Add success.
I0320 03:57:43.420277 543705 net.go:648] Add success.
I0320 03:57:43.422864 543705 net.go:770] primary dev: ETH0
I0320 03:57:43.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:57:43.422893 543705 net.go:698] Add success.
I0320 03:57:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:57:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:57:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:57:53.409777 543705 memory.go:184] no items to output this cycle
I0320 03:57:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 03:58:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:03.409785 543705 memory.go:184] no items to output this cycle
I0320 03:58:03.409786 543705 cpu.go:275] no items to output this cycle
W0320 03:58:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:58:13.409730 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:58:13.409736 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:58:13.409797 543705 cpu.go:282] Add success.
E0320 03:58:13.409838 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:13.409864 543705 memory.go:191] Add success.
I0320 03:58:13.420056 543705 net.go:648] Add success.
I0320 03:58:13.422580 543705 net.go:770] primary dev: ETH0
I0320 03:58:13.422594 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:58:13.422607 543705 net.go:698] Add success.
I0320 03:58:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:58:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:58:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 03:58:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:58:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 03:58:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:58:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:58:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:58:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:58:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:58:22.481759 543705 disk_info.go:125] begin check local disk info of client
I0320 03:58:22.484244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:58:22.484254 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000edf40 0xc0004fc000]
E0320 03:58:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:23.409769 543705 memory.go:184] no items to output this cycle
I0320 03:58:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 03:58:33.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:33.409912 543705 memory.go:184] no items to output this cycle
I0320 03:58:33.410114 543705 cpu.go:275] no items to output this cycle
E0320 03:58:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:43.409780 543705 memory.go:191] Add success.
I0320 03:58:43.409811 543705 cpu.go:282] Add success.
I0320 03:58:43.419851 543705 net.go:648] Add success.
I0320 03:58:43.422582 543705 net.go:770] primary dev: ETH0
I0320 03:58:43.422596 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:58:43.422607 543705 net.go:698] Add success.
I0320 03:58:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:58:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:58:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:58:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:58:53.409768 543705 memory.go:184] no items to output this cycle
I0320 03:58:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 03:59:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:03.409780 543705 memory.go:184] no items to output this cycle
I0320 03:59:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 03:59:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:13.409796 543705 memory.go:191] Add success.
I0320 03:59:13.409797 543705 cpu.go:282] Add success.
W0320 03:59:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 03:59:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 03:59:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 03:59:13.420596 543705 net.go:648] Add success.
I0320 03:59:13.423611 543705 net.go:770] primary dev: ETH0
I0320 03:59:13.423625 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:59:13.423636 543705 net.go:698] Add success.
I0320 03:59:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 03:59:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 03:59:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 03:59:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 03:59:14.456630 543705 disk_worker.go:494] system disk:vda1
I0320 03:59:14.456659 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 03:59:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 03:59:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:59:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:59:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 03:59:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0320 03:59:22.485117 543705 disk_info.go:125] begin check local disk info of client
I0320 03:59:22.487707 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 03:59:22.487714 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a39c0 0xc0002a3a00]
E0320 03:59:23.409735 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:23.409748 543705 memory.go:184] no items to output this cycle
I0320 03:59:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 03:59:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:33.409795 543705 memory.go:184] no items to output this cycle
I0320 03:59:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 03:59:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:43.409825 543705 memory.go:191] Add success.
I0320 03:59:43.409833 543705 cpu.go:282] Add success.
I0320 03:59:43.419976 543705 net.go:648] Add success.
I0320 03:59:43.423015 543705 net.go:770] primary dev: ETH0
I0320 03:59:43.423030 543705 net.go:802] Send network stats successfully!,count is 6
I0320 03:59:43.423045 543705 net.go:698] Add success.
I0320 03:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 03:59:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 03:59:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 03:59:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 03:59:53.409777 543705 memory.go:184] no items to output this cycle
I0320 03:59:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 04:00:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:00:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:00:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:13.409782 543705 memory.go:191] Add success.
W0320 04:00:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 04:00:13.409810 543705 cpu.go:282] Add success.
W0320 04:00:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:00:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:00:13.420057 543705 net.go:648] Add success.
I0320 04:00:13.422726 543705 net.go:770] primary dev: ETH0
I0320 04:00:13.422742 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:00:13.422756 543705 net.go:698] Add success.
I0320 04:00:13.468517 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1293f6e7-f7a0-4bd2-b118-fb0bdaaa9470","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:00:13.468558 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:00:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:00:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:00:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 04:00:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:00:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 04:00:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:00:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:00:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:00:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:00:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:00:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:00:22.487799 543705 disk_info.go:125] begin check local disk info of client
I0320 04:00:22.490355 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:00:22.490362 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abf00 0xc0001abf40]
E0320 04:00:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:23.409759 543705 memory.go:184] no items to output this cycle
I0320 04:00:23.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:00:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:33.409770 543705 memory.go:184] no items to output this cycle
I0320 04:00:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 04:00:38.105221 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:00:38.105227 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:00:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:43.410704 543705 memory.go:191] Add success.
I0320 04:00:43.409803 543705 cpu.go:282] Add success.
I0320 04:00:43.420434 543705 net.go:648] Add success.
I0320 04:00:43.422966 543705 net.go:770] primary dev: ETH0
I0320 04:00:43.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:00:43.422992 543705 net.go:698] Add success.
I0320 04:00:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:00:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:00:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:00:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:00:53.409793 543705 memory.go:184] no items to output this cycle
I0320 04:00:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 04:01:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:03.409777 543705 memory.go:184] no items to output this cycle
I0320 04:01:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:01:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:13.409816 543705 memory.go:191] Add success.
I0320 04:01:13.409827 543705 cpu.go:282] Add success.
W0320 04:01:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:01:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:01:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:01:13.420163 543705 net.go:648] Add success.
I0320 04:01:13.422875 543705 net.go:770] primary dev: ETH0
I0320 04:01:13.422888 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:01:13.422900 543705 net.go:698] Add success.
I0320 04:01:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:01:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:01:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 04:01:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:01:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 04:01:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:01:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:01:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:01:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:01:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:01:16.472458 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:01:22.491155 543705 disk_info.go:125] begin check local disk info of client
I0320 04:01:22.493621 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:01:22.493628 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266340 0xc000266380]
E0320 04:01:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:23.409792 543705 memory.go:184] no items to output this cycle
I0320 04:01:23.409807 543705 cpu.go:275] no items to output this cycle
E0320 04:01:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 04:01:33.409781 543705 memory.go:184] no items to output this cycle
E0320 04:01:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:43.409785 543705 memory.go:191] Add success.
I0320 04:01:43.409803 543705 cpu.go:282] Add success.
I0320 04:01:43.419956 543705 net.go:648] Add success.
I0320 04:01:43.422364 543705 net.go:770] primary dev: ETH0
I0320 04:01:43.422379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:01:43.422394 543705 net.go:698] Add success.
I0320 04:01:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:01:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:01:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:01:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:01:53.409787 543705 memory.go:184] no items to output this cycle
I0320 04:01:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:02:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:03.409817 543705 memory.go:184] no items to output this cycle
I0320 04:02:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:02:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:13.409816 543705 memory.go:191] Add success.
I0320 04:02:13.409820 543705 cpu.go:282] Add success.
W0320 04:02:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:02:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:02:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:02:13.420176 543705 net.go:648] Add success.
I0320 04:02:13.422806 543705 net.go:770] primary dev: ETH0
I0320 04:02:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:02:13.422832 543705 net.go:698] Add success.
W0320 04:02:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:02:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 04:02:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:02:14.456764 543705 disk_worker.go:494] system disk:vda1
I0320 04:02:14.456804 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:02:14.457136 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:02:14.457144 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:02:14.457149 543705 custom_config.go:64] query custom config with name: gpu
E0320 04:02:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:02:15.456816 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:02:16.457902 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:02:16.457901 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:02:16.457957 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:02:16.457976 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:02:16.472323 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:02:22.494164 543705 disk_info.go:125] begin check local disk info of client
I0320 04:02:22.496602 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:02:22.496609 543705 disk_info.go:196] parse disk info done, disk is : [0xc000253240 0xc000253280]
E0320 04:02:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:23.409781 543705 memory.go:184] no items to output this cycle
I0320 04:02:23.409805 543705 cpu.go:275] no items to output this cycle
E0320 04:02:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:33.409792 543705 memory.go:184] no items to output this cycle
I0320 04:02:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 04:02:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:43.409818 543705 memory.go:191] Add success.
I0320 04:02:43.409826 543705 cpu.go:282] Add success.
I0320 04:02:43.420077 543705 net.go:648] Add success.
I0320 04:02:43.423568 543705 net.go:770] primary dev: ETH0
I0320 04:02:43.423582 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:02:43.423596 543705 net.go:698] Add success.
I0320 04:02:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:02:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:02:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:02:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:02:53.409772 543705 cpu.go:275] no items to output this cycle
I0320 04:02:53.409781 543705 memory.go:184] no items to output this cycle
E0320 04:03:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:03.409806 543705 memory.go:184] no items to output this cycle
I0320 04:03:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 04:03:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:13.409776 543705 memory.go:191] Add success.
W0320 04:03:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 04:03:13.409805 543705 cpu.go:282] Add success.
W0320 04:03:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:03:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:03:13.420148 543705 net.go:648] Add success.
I0320 04:03:13.422878 543705 net.go:770] primary dev: ETH0
I0320 04:03:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:03:13.422904 543705 net.go:698] Add success.
I0320 04:03:13.464086 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b4b8a64a-5713-4e5f-8a85-312427ea552b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:03:13.464121 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:03:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:03:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:03:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 04:03:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:03:14.456500 543705 disk_worker.go:494] system disk:vda1
I0320 04:03:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:03:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:03:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:03:16.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:03:16.458106 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:03:16.472527 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:03:22.497177 543705 disk_info.go:125] begin check local disk info of client
I0320 04:03:22.499847 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:03:22.499853 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024c200 0xc00024c240]
E0320 04:03:23.409737 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:23.409752 543705 memory.go:184] no items to output this cycle
I0320 04:03:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:03:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:33.409784 543705 memory.go:184] no items to output this cycle
I0320 04:03:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 04:03:38.105733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:03:38.105740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:03:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:43.410583 543705 memory.go:191] Add success.
I0320 04:03:43.409804 543705 cpu.go:282] Add success.
I0320 04:03:43.420330 543705 net.go:648] Add success.
I0320 04:03:43.422772 543705 net.go:770] primary dev: ETH0
I0320 04:03:43.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:03:43.422799 543705 net.go:698] Add success.
I0320 04:03:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:03:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:03:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:03:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:03:53.409798 543705 memory.go:184] no items to output this cycle
I0320 04:03:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:04:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:03.409785 543705 memory.go:184] no items to output this cycle
I0320 04:04:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 04:04:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:13.409784 543705 memory.go:191] Add success.
I0320 04:04:13.409805 543705 cpu.go:282] Add success.
W0320 04:04:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:04:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:04:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:04:13.420131 543705 net.go:648] Add success.
I0320 04:04:13.422798 543705 net.go:770] primary dev: ETH0
I0320 04:04:13.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:04:13.422822 543705 net.go:698] Add success.
I0320 04:04:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:04:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:04:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 04:04:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:04:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 04:04:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:04:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:04:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:04:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:04:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:04:16.472464 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:04:22.499938 543705 disk_info.go:125] begin check local disk info of client
I0320 04:04:22.502448 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:04:22.502454 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab000 0xc0001ab040]
E0320 04:04:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:23.409763 543705 memory.go:184] no items to output this cycle
I0320 04:04:23.409774 543705 cpu.go:275] no items to output this cycle
E0320 04:04:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:33.409777 543705 memory.go:184] no items to output this cycle
I0320 04:04:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 04:04:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:43.409869 543705 memory.go:191] Add success.
I0320 04:04:43.409919 543705 cpu.go:282] Add success.
I0320 04:04:43.419708 543705 net.go:648] Add success.
I0320 04:04:43.422626 543705 net.go:770] primary dev: ETH0
I0320 04:04:43.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:04:43.422657 543705 net.go:698] Add success.
I0320 04:04:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:04:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:04:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:04:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:04:53.409792 543705 memory.go:184] no items to output this cycle
I0320 04:04:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 04:05:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:03.409793 543705 memory.go:184] no items to output this cycle
I0320 04:05:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 04:05:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:13.409797 543705 memory.go:191] Add success.
I0320 04:05:13.409797 543705 cpu.go:282] Add success.
W0320 04:05:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:05:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:05:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:05:13.420171 543705 net.go:648] Add success.
I0320 04:05:13.422756 543705 net.go:770] primary dev: ETH0
I0320 04:05:13.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:05:13.422783 543705 net.go:698] Add success.
I0320 04:05:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:05:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:05:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 04:05:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:05:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 04:05:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:05:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:05:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:05:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:05:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:05:16.472489 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:05:22.503202 543705 disk_info.go:125] begin check local disk info of client
I0320 04:05:22.505875 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:05:22.505881 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029bbc0 0xc00029bc00]
E0320 04:05:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:23.409772 543705 memory.go:184] no items to output this cycle
I0320 04:05:23.409781 543705 cpu.go:275] no items to output this cycle
E0320 04:05:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:33.409777 543705 memory.go:184] no items to output this cycle
I0320 04:05:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 04:05:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:43.409807 543705 memory.go:191] Add success.
I0320 04:05:43.409815 543705 cpu.go:282] Add success.
I0320 04:05:43.420118 543705 net.go:648] Add success.
I0320 04:05:43.422838 543705 net.go:770] primary dev: ETH0
I0320 04:05:43.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:05:43.422864 543705 net.go:698] Add success.
I0320 04:05:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:05:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:05:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:05:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:05:53.409795 543705 memory.go:184] no items to output this cycle
I0320 04:05:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 04:06:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:03.409810 543705 memory.go:184] no items to output this cycle
I0320 04:06:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 04:06:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:13.409791 543705 memory.go:191] Add success.
I0320 04:06:13.409803 543705 cpu.go:282] Add success.
W0320 04:06:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:06:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:06:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:06:13.420335 543705 net.go:648] Add success.
I0320 04:06:13.422917 543705 net.go:770] primary dev: ETH0
I0320 04:06:13.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:06:13.422946 543705 net.go:698] Add success.
I0320 04:06:13.482877 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"25840856-042d-4156-8681-82a6c384048a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:06:13.482910 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:06:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:06:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:06:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 04:06:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:06:14.456634 543705 disk_worker.go:494] system disk:vda1
I0320 04:06:14.456664 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:06:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:06:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:06:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:06:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:06:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:06:22.505965 543705 disk_info.go:125] begin check local disk info of client
I0320 04:06:22.508528 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:06:22.508535 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ac80 0xc00007acc0]
E0320 04:06:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:23.409763 543705 memory.go:184] no items to output this cycle
I0320 04:06:23.409796 543705 cpu.go:275] no items to output this cycle
E0320 04:06:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:33.409803 543705 memory.go:184] no items to output this cycle
I0320 04:06:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 04:06:38.109241 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:06:38.109247 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:06:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:43.410943 543705 memory.go:191] Add success.
I0320 04:06:43.409792 543705 cpu.go:282] Add success.
I0320 04:06:43.419736 543705 net.go:648] Add success.
I0320 04:06:43.422460 543705 net.go:770] primary dev: ETH0
I0320 04:06:43.422474 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:06:43.422486 543705 net.go:698] Add success.
I0320 04:06:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:06:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:06:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:06:53.410651 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:06:53.410666 543705 memory.go:184] no items to output this cycle
I0320 04:06:53.410672 543705 cpu.go:275] no items to output this cycle
E0320 04:07:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:03.409784 543705 memory.go:184] no items to output this cycle
I0320 04:07:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 04:07:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:13.409777 543705 memory.go:191] Add success.
W0320 04:07:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 04:07:13.409806 543705 cpu.go:282] Add success.
W0320 04:07:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:07:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:07:13.420295 543705 net.go:648] Add success.
I0320 04:07:13.422858 543705 net.go:770] primary dev: ETH0
I0320 04:07:13.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:07:13.422882 543705 net.go:698] Add success.
I0320 04:07:13.453617 543705 event_worker.go:152] Polling the log file for events...
W0320 04:07:14.455093 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:07:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0320 04:07:14.455155 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:07:14.456923 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:07:14.456932 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:07:14.456938 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:07:14.456975 543705 disk_worker.go:494] system disk:vda1
I0320 04:07:14.457016 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:07:15.456790 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:07:15.456798 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:07:16.458098 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:07:16.458122 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:07:16.458160 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:07:16.458187 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:07:16.472552 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:07:22.509230 543705 disk_info.go:125] begin check local disk info of client
I0320 04:07:22.511716 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:07:22.511722 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002923c0 0xc000292400]
E0320 04:07:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:23.409784 543705 memory.go:184] no items to output this cycle
I0320 04:07:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 04:07:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:33.409771 543705 memory.go:184] no items to output this cycle
I0320 04:07:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:07:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:43.409815 543705 memory.go:191] Add success.
I0320 04:07:43.409822 543705 cpu.go:282] Add success.
I0320 04:07:43.419905 543705 net.go:648] Add success.
I0320 04:07:43.422609 543705 net.go:770] primary dev: ETH0
I0320 04:07:43.422622 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:07:43.422634 543705 net.go:698] Add success.
I0320 04:07:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:07:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:07:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:07:53.410229 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:07:53.410254 543705 memory.go:184] no items to output this cycle
I0320 04:07:53.410278 543705 cpu.go:275] no items to output this cycle
E0320 04:08:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:03.409783 543705 memory.go:184] no items to output this cycle
I0320 04:08:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 04:08:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:13.409795 543705 memory.go:191] Add success.
I0320 04:08:13.409798 543705 cpu.go:282] Add success.
W0320 04:08:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:08:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:08:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:08:13.420119 543705 net.go:648] Add success.
I0320 04:08:13.422905 543705 net.go:770] primary dev: ETH0
I0320 04:08:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:08:13.422930 543705 net.go:698] Add success.
I0320 04:08:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:08:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:08:14.455230 543705 disk_worker.go:708] disk space is not compliant
W0320 04:08:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:08:14.456648 543705 disk_worker.go:494] system disk:vda1
I0320 04:08:14.456678 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:08:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:08:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:08:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:08:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:08:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:08:22.512204 543705 disk_info.go:125] begin check local disk info of client
I0320 04:08:22.514720 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:08:22.514726 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4ec0 0xc0000c4f00]
E0320 04:08:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:23.409791 543705 memory.go:184] no items to output this cycle
I0320 04:08:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 04:08:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:33.409786 543705 memory.go:184] no items to output this cycle
I0320 04:08:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 04:08:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:43.409794 543705 memory.go:191] Add success.
I0320 04:08:43.409801 543705 cpu.go:282] Add success.
I0320 04:08:43.420015 543705 net.go:648] Add success.
I0320 04:08:43.422771 543705 net.go:770] primary dev: ETH0
I0320 04:08:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:08:43.422796 543705 net.go:698] Add success.
I0320 04:08:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:08:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:08:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:08:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:08:53.409768 543705 memory.go:184] no items to output this cycle
I0320 04:08:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:09:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:03.409802 543705 memory.go:184] no items to output this cycle
I0320 04:09:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:09:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:13.409812 543705 memory.go:191] Add success.
I0320 04:09:13.409825 543705 cpu.go:282] Add success.
W0320 04:09:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:09:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:09:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:09:13.420144 543705 net.go:648] Add success.
I0320 04:09:13.423265 543705 net.go:770] primary dev: ETH0
I0320 04:09:13.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:09:13.423292 543705 net.go:698] Add success.
I0320 04:09:13.468119 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e3a9009-3e34-4ce2-8153-02756b06f7b6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:09:13.468155 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:09:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:09:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 04:09:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:09:14.456642 543705 disk_worker.go:494] system disk:vda1
I0320 04:09:14.456673 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:09:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:09:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:09:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:09:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:09:16.472609 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:09:22.515217 543705 disk_info.go:125] begin check local disk info of client
I0320 04:09:22.517801 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:09:22.517807 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330140 0xc000330180]
E0320 04:09:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:23.409790 543705 memory.go:184] no items to output this cycle
I0320 04:09:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:09:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:33.409788 543705 memory.go:184] no items to output this cycle
I0320 04:09:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 04:09:38.109731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:09:38.109737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:09:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:43.409797 543705 cpu.go:282] Add success.
I0320 04:09:43.410819 543705 memory.go:191] Add success.
I0320 04:09:43.419714 543705 net.go:648] Add success.
I0320 04:09:43.422396 543705 net.go:770] primary dev: ETH0
I0320 04:09:43.422409 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:09:43.422421 543705 net.go:698] Add success.
I0320 04:09:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:09:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:09:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:09:53.410280 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:09:53.410300 543705 memory.go:184] no items to output this cycle
I0320 04:09:53.410303 543705 cpu.go:275] no items to output this cycle
E0320 04:10:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:03.409793 543705 cpu.go:275] no items to output this cycle
I0320 04:10:03.409795 543705 memory.go:184] no items to output this cycle
E0320 04:10:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:13.409794 543705 cpu.go:282] Add success.
I0320 04:10:13.409799 543705 memory.go:191] Add success.
W0320 04:10:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:10:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:10:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:10:13.420185 543705 net.go:648] Add success.
I0320 04:10:13.423091 543705 net.go:770] primary dev: ETH0
I0320 04:10:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:10:13.423120 543705 net.go:698] Add success.
I0320 04:10:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:10:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:10:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 04:10:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:10:14.456540 543705 disk_worker.go:494] system disk:vda1
I0320 04:10:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:10:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:10:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:10:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:10:22.518232 543705 disk_info.go:125] begin check local disk info of client
I0320 04:10:22.520668 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:10:22.520675 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002471c0 0xc000247200]
E0320 04:10:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:23.409788 543705 memory.go:184] no items to output this cycle
I0320 04:10:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:10:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:33.409783 543705 memory.go:184] no items to output this cycle
I0320 04:10:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 04:10:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:43.409821 543705 memory.go:191] Add success.
I0320 04:10:43.409823 543705 cpu.go:282] Add success.
I0320 04:10:43.419738 543705 net.go:648] Add success.
I0320 04:10:43.422353 543705 net.go:770] primary dev: ETH0
I0320 04:10:43.422367 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:10:43.422379 543705 net.go:698] Add success.
I0320 04:10:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:10:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:10:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:10:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:10:53.409779 543705 memory.go:184] no items to output this cycle
I0320 04:10:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 04:11:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:03.409784 543705 memory.go:184] no items to output this cycle
I0320 04:11:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:11:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:13.409788 543705 memory.go:191] Add success.
I0320 04:11:13.409794 543705 cpu.go:282] Add success.
W0320 04:11:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:11:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:11:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:11:13.420223 543705 net.go:648] Add success.
I0320 04:11:13.422967 543705 net.go:770] primary dev: ETH0
I0320 04:11:13.422980 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:11:13.422992 543705 net.go:698] Add success.
I0320 04:11:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:11:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:11:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 04:11:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:11:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 04:11:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:11:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:11:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:11:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:11:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:11:16.472525 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:11:22.521249 543705 disk_info.go:125] begin check local disk info of client
I0320 04:11:22.523872 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:11:22.523879 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266c80 0xc000266cc0]
E0320 04:11:23.409739 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:23.409754 543705 memory.go:184] no items to output this cycle
I0320 04:11:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:11:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:33.409804 543705 memory.go:184] no items to output this cycle
I0320 04:11:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 04:11:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:43.409794 543705 memory.go:191] Add success.
I0320 04:11:43.409798 543705 cpu.go:282] Add success.
I0320 04:11:43.419706 543705 net.go:648] Add success.
I0320 04:11:43.422174 543705 net.go:770] primary dev: ETH0
I0320 04:11:43.422188 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:11:43.422200 543705 net.go:698] Add success.
I0320 04:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:11:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:11:53.410247 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:11:53.410262 543705 memory.go:184] no items to output this cycle
I0320 04:11:53.410268 543705 cpu.go:275] no items to output this cycle
E0320 04:12:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:03.409783 543705 memory.go:184] no items to output this cycle
I0320 04:12:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 04:12:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:13.409789 543705 memory.go:191] Add success.
I0320 04:12:13.409791 543705 cpu.go:282] Add success.
W0320 04:12:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:12:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:12:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:12:13.420037 543705 net.go:648] Add success.
I0320 04:12:13.422505 543705 net.go:770] primary dev: ETH0
I0320 04:12:13.422521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:12:13.422535 543705 net.go:698] Add success.
I0320 04:12:13.471212 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"12b4f6a9-c5da-4295-b225-378916482f15","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:12:13.471245 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 04:12:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:12:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 04:12:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:12:14.455889 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:12:14.455898 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:12:14.455904 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:12:14.456641 543705 disk_worker.go:494] system disk:vda1
I0320 04:12:14.456683 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:12:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:12:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:12:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:12:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:12:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:12:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:12:16.472331 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:12:22.523964 543705 disk_info.go:125] begin check local disk info of client
I0320 04:12:22.526502 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:12:22.526509 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053fd00 0xc00053fd40]
E0320 04:12:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:23.409759 543705 memory.go:184] no items to output this cycle
I0320 04:12:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:12:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:33.409802 543705 memory.go:184] no items to output this cycle
I0320 04:12:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 04:12:38.109876 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:12:38.109885 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:12:43.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:43.410988 543705 memory.go:191] Add success.
I0320 04:12:43.409920 543705 cpu.go:282] Add success.
I0320 04:12:43.419716 543705 net.go:648] Add success.
I0320 04:12:43.422475 543705 net.go:770] primary dev: ETH0
I0320 04:12:43.422488 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:12:43.422501 543705 net.go:698] Add success.
I0320 04:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:12:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:12:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:12:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:12:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 04:12:53.409797 543705 memory.go:184] no items to output this cycle
E0320 04:13:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:03.409780 543705 memory.go:184] no items to output this cycle
I0320 04:13:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 04:13:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:13.409784 543705 memory.go:191] Add success.
I0320 04:13:13.409802 543705 cpu.go:282] Add success.
W0320 04:13:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:13:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:13:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:13:13.420170 543705 net.go:648] Add success.
I0320 04:13:13.422784 543705 net.go:770] primary dev: ETH0
I0320 04:13:13.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:13:13.422813 543705 net.go:698] Add success.
I0320 04:13:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:13:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:13:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 04:13:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:13:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 04:13:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:13:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:13:16.458041 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:13:16.458126 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:13:16.458164 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:13:16.472772 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:13:22.527325 543705 disk_info.go:125] begin check local disk info of client
I0320 04:13:22.529947 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:13:22.529954 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053eac0 0xc00053eb00]
E0320 04:13:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:23.409774 543705 memory.go:184] no items to output this cycle
I0320 04:13:23.409776 543705 cpu.go:275] no items to output this cycle
E0320 04:13:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:33.409811 543705 memory.go:184] no items to output this cycle
I0320 04:13:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 04:13:43.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:43.409893 543705 memory.go:191] Add success.
I0320 04:13:43.409969 543705 cpu.go:282] Add success.
I0320 04:13:43.419717 543705 net.go:648] Add success.
I0320 04:13:43.422113 543705 net.go:770] primary dev: ETH0
I0320 04:13:43.422128 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:13:43.422143 543705 net.go:698] Add success.
I0320 04:13:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:13:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:13:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:13:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:13:53.409775 543705 memory.go:184] no items to output this cycle
I0320 04:13:53.409787 543705 cpu.go:275] no items to output this cycle
I0320 04:14:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 04:14:03.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:03.409814 543705 memory.go:184] no items to output this cycle
E0320 04:14:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:13.409810 543705 memory.go:191] Add success.
I0320 04:14:13.409819 543705 cpu.go:282] Add success.
W0320 04:14:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:14:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:14:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:14:13.420125 543705 net.go:648] Add success.
I0320 04:14:13.422862 543705 net.go:770] primary dev: ETH0
I0320 04:14:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:14:13.422892 543705 net.go:698] Add success.
I0320 04:14:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:14:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:14:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 04:14:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:14:14.456531 543705 disk_worker.go:494] system disk:vda1
I0320 04:14:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:14:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:14:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:14:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:14:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:14:16.472407 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:14:22.530286 543705 disk_info.go:125] begin check local disk info of client
I0320 04:14:22.532774 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:14:22.532780 543705 disk_info.go:196] parse disk info done, disk is : [0xc000253440 0xc000253480]
E0320 04:14:23.409735 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:23.409749 543705 memory.go:184] no items to output this cycle
I0320 04:14:23.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:14:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:33.409770 543705 memory.go:184] no items to output this cycle
I0320 04:14:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:14:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:43.409811 543705 memory.go:191] Add success.
I0320 04:14:43.409817 543705 cpu.go:282] Add success.
I0320 04:14:43.419998 543705 net.go:648] Add success.
I0320 04:14:43.423196 543705 net.go:770] primary dev: ETH0
I0320 04:14:43.423209 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:14:43.423220 543705 net.go:698] Add success.
I0320 04:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:14:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:14:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:14:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:14:53.409809 543705 memory.go:184] no items to output this cycle
I0320 04:14:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:15:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:03.409782 543705 memory.go:184] no items to output this cycle
I0320 04:15:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 04:15:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:13.409777 543705 memory.go:191] Add success.
W0320 04:15:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:15:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:15:13.409814 543705 cpu.go:282] Add success.
I0320 04:15:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:15:13.420103 543705 net.go:648] Add success.
I0320 04:15:13.422560 543705 net.go:770] primary dev: ETH0
I0320 04:15:13.422573 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:15:13.422585 543705 net.go:698] Add success.
I0320 04:15:13.463902 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c739081b-d6fa-45c5-8904-12eafc106887","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:15:13.463934 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:15:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:15:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:15:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 04:15:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:15:14.456684 543705 disk_worker.go:494] system disk:vda1
I0320 04:15:14.456726 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:15:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:15:16.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:15:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:15:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:15:16.472515 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:15:22.533316 543705 disk_info.go:125] begin check local disk info of client
I0320 04:15:22.535872 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:15:22.535878 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed440 0xc0000ed480]
E0320 04:15:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:23.409792 543705 memory.go:184] no items to output this cycle
I0320 04:15:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:15:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:33.409811 543705 memory.go:184] no items to output this cycle
I0320 04:15:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 04:15:38.113256 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:15:38.113262 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:15:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:43.410691 543705 memory.go:191] Add success.
I0320 04:15:43.409796 543705 cpu.go:282] Add success.
I0320 04:15:43.420574 543705 net.go:648] Add success.
I0320 04:15:43.422935 543705 net.go:770] primary dev: ETH0
I0320 04:15:43.422947 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:15:43.422960 543705 net.go:698] Add success.
I0320 04:15:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:15:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:15:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:15:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:15:53.409789 543705 memory.go:184] no items to output this cycle
I0320 04:15:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:16:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:03.409785 543705 memory.go:184] no items to output this cycle
I0320 04:16:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:16:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:13.409795 543705 memory.go:191] Add success.
I0320 04:16:13.409797 543705 cpu.go:282] Add success.
W0320 04:16:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:16:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:16:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:16:13.420248 543705 net.go:648] Add success.
I0320 04:16:13.423172 543705 net.go:770] primary dev: ETH0
I0320 04:16:13.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:16:13.423201 543705 net.go:698] Add success.
I0320 04:16:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:16:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:16:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 04:16:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:16:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 04:16:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:16:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:16:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:16:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:16:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:16:16.472371 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:16:22.536318 543705 disk_info.go:125] begin check local disk info of client
I0320 04:16:22.538820 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:16:22.538827 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee100 0xc0003ee140]
E0320 04:16:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:23.409799 543705 memory.go:184] no items to output this cycle
I0320 04:16:23.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:16:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:33.409776 543705 memory.go:184] no items to output this cycle
I0320 04:16:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 04:16:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:43.409809 543705 memory.go:191] Add success.
I0320 04:16:43.409818 543705 cpu.go:282] Add success.
I0320 04:16:43.419903 543705 net.go:648] Add success.
I0320 04:16:43.422506 543705 net.go:770] primary dev: ETH0
I0320 04:16:43.422521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:16:43.422536 543705 net.go:698] Add success.
I0320 04:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:16:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:16:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:16:53.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:16:53.409809 543705 cpu.go:275] no items to output this cycle
I0320 04:16:53.409818 543705 memory.go:184] no items to output this cycle
E0320 04:17:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:03.409772 543705 memory.go:184] no items to output this cycle
I0320 04:17:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:17:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:13.409810 543705 memory.go:191] Add success.
I0320 04:17:13.409815 543705 cpu.go:282] Add success.
W0320 04:17:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:17:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:17:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:17:13.420066 543705 net.go:648] Add success.
I0320 04:17:13.422588 543705 net.go:770] primary dev: ETH0
I0320 04:17:13.422603 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:17:13.422617 543705 net.go:698] Add success.
I0320 04:17:13.453157 543705 event_worker.go:152] Polling the log file for events...
W0320 04:17:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:17:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 04:17:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:17:14.456403 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:17:14.456414 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:17:14.456420 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:17:14.457453 543705 disk_worker.go:494] system disk:vda1
I0320 04:17:14.457485 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:17:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:17:15.456853 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:17:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:17:16.457988 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:17:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:17:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:17:16.472582 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:17:22.538917 543705 disk_info.go:125] begin check local disk info of client
I0320 04:17:22.541443 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:17:22.541450 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484100 0xc000484140]
E0320 04:17:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:23.409785 543705 memory.go:184] no items to output this cycle
I0320 04:17:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 04:17:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:33.409765 543705 memory.go:184] no items to output this cycle
I0320 04:17:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:17:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:43.409808 543705 memory.go:191] Add success.
I0320 04:17:43.409816 543705 cpu.go:282] Add success.
I0320 04:17:43.419861 543705 net.go:648] Add success.
I0320 04:17:43.422484 543705 net.go:770] primary dev: ETH0
I0320 04:17:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:17:43.422510 543705 net.go:698] Add success.
I0320 04:17:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:17:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:17:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:17:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:17:53.409801 543705 memory.go:184] no items to output this cycle
I0320 04:17:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 04:18:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:03.409799 543705 cpu.go:275] no items to output this cycle
I0320 04:18:03.409801 543705 memory.go:184] no items to output this cycle
E0320 04:18:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:13.409794 543705 memory.go:191] Add success.
I0320 04:18:13.409799 543705 cpu.go:282] Add success.
W0320 04:18:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:18:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:18:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:18:13.420261 543705 net.go:648] Add success.
I0320 04:18:13.422818 543705 net.go:770] primary dev: ETH0
I0320 04:18:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:18:13.422842 543705 net.go:698] Add success.
I0320 04:18:13.467823 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"834f8f0f-6551-44c4-a10d-06ec40371728","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:18:13.467853 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:18:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:18:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:18:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 04:18:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:18:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 04:18:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:18:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:18:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:18:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:18:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:18:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:18:22.542409 543705 disk_info.go:125] begin check local disk info of client
I0320 04:18:22.544860 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:18:22.544866 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384500 0xc000384540]
E0320 04:18:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:23.409784 543705 memory.go:184] no items to output this cycle
I0320 04:18:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:18:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:33.409782 543705 memory.go:184] no items to output this cycle
I0320 04:18:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 04:18:38.113730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:18:38.113736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:18:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:43.410668 543705 memory.go:191] Add success.
I0320 04:18:43.409810 543705 cpu.go:282] Add success.
I0320 04:18:43.420445 543705 net.go:648] Add success.
I0320 04:18:43.423300 543705 net.go:770] primary dev: ETH0
I0320 04:18:43.423315 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:18:43.423330 543705 net.go:698] Add success.
I0320 04:18:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:18:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:18:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:18:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:18:53.409813 543705 memory.go:184] no items to output this cycle
I0320 04:18:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 04:19:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:03.409799 543705 memory.go:184] no items to output this cycle
I0320 04:19:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:19:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:13.409789 543705 memory.go:191] Add success.
I0320 04:19:13.409811 543705 cpu.go:282] Add success.
W0320 04:19:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:19:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:19:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:19:13.420153 543705 net.go:648] Add success.
I0320 04:19:13.423140 543705 net.go:770] primary dev: ETH0
I0320 04:19:13.423155 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:19:13.423169 543705 net.go:698] Add success.
I0320 04:19:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:19:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:19:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 04:19:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:19:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 04:19:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:19:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:19:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:19:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:19:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:19:16.472503 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:19:22.545371 543705 disk_info.go:125] begin check local disk info of client
I0320 04:19:22.547922 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:19:22.547928 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ab40 0xc00007ab80]
E0320 04:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:23.409795 543705 memory.go:184] no items to output this cycle
I0320 04:19:23.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:19:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:33.409801 543705 memory.go:184] no items to output this cycle
I0320 04:19:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 04:19:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:43.409788 543705 memory.go:191] Add success.
I0320 04:19:43.409799 543705 cpu.go:282] Add success.
I0320 04:19:43.419960 543705 net.go:648] Add success.
I0320 04:19:43.422549 543705 net.go:770] primary dev: ETH0
I0320 04:19:43.422563 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:19:43.422574 543705 net.go:698] Add success.
I0320 04:19:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:19:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:19:46.458097 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:19:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:19:53.409808 543705 memory.go:184] no items to output this cycle
I0320 04:19:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 04:20:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:03.409791 543705 memory.go:184] no items to output this cycle
I0320 04:20:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 04:20:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:13.409786 543705 memory.go:191] Add success.
I0320 04:20:13.409788 543705 cpu.go:282] Add success.
W0320 04:20:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:20:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:20:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:20:13.420037 543705 net.go:648] Add success.
I0320 04:20:13.422535 543705 net.go:770] primary dev: ETH0
I0320 04:20:13.422547 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:20:13.422559 543705 net.go:698] Add success.
I0320 04:20:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:20:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:20:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 04:20:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:20:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 04:20:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:20:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:20:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:20:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:20:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:20:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:20:22.548015 543705 disk_info.go:125] begin check local disk info of client
I0320 04:20:22.550473 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:20:22.550479 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034bc00 0xc00034bc40]
E0320 04:20:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:23.409783 543705 memory.go:184] no items to output this cycle
I0320 04:20:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 04:20:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:33.409781 543705 memory.go:184] no items to output this cycle
I0320 04:20:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 04:20:43.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:43.409897 543705 memory.go:191] Add success.
I0320 04:20:43.410116 543705 cpu.go:282] Add success.
I0320 04:20:43.419718 543705 net.go:648] Add success.
I0320 04:20:43.422333 543705 net.go:770] primary dev: ETH0
I0320 04:20:43.422346 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:20:43.422357 543705 net.go:698] Add success.
I0320 04:20:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:20:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:20:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:20:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:20:53.409776 543705 cpu.go:275] no items to output this cycle
I0320 04:20:53.409786 543705 memory.go:184] no items to output this cycle
E0320 04:21:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:03.409804 543705 memory.go:184] no items to output this cycle
I0320 04:21:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 04:21:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:13.409809 543705 memory.go:191] Add success.
I0320 04:21:13.409819 543705 cpu.go:282] Add success.
W0320 04:21:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:21:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:21:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:21:13.420099 543705 net.go:648] Add success.
I0320 04:21:13.423133 543705 net.go:770] primary dev: ETH0
I0320 04:21:13.423145 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:21:13.423157 543705 net.go:698] Add success.
I0320 04:21:13.473262 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8a8d9d78-f3eb-4b18-9446-2ee8452ce807","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:21:13.473297 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:21:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:21:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:21:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 04:21:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:21:14.456614 543705 disk_worker.go:494] system disk:vda1
I0320 04:21:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:21:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:21:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:21:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:21:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:21:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:21:22.551453 543705 disk_info.go:125] begin check local disk info of client
I0320 04:21:22.554023 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:21:22.554029 543705 disk_info.go:196] parse disk info done, disk is : [0xc000385300 0xc000385340]
E0320 04:21:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:23.409891 543705 memory.go:184] no items to output this cycle
I0320 04:21:23.409928 543705 cpu.go:275] no items to output this cycle
E0320 04:21:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:33.409782 543705 memory.go:184] no items to output this cycle
I0320 04:21:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 04:21:38.117286 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:21:38.117292 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:21:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:43.410625 543705 memory.go:191] Add success.
I0320 04:21:43.409801 543705 cpu.go:282] Add success.
I0320 04:21:43.420328 543705 net.go:648] Add success.
I0320 04:21:43.422942 543705 net.go:770] primary dev: ETH0
I0320 04:21:43.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:21:43.422972 543705 net.go:698] Add success.
I0320 04:21:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:21:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:21:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:21:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:21:53.409809 543705 memory.go:184] no items to output this cycle
I0320 04:21:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:22:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:03.409784 543705 cpu.go:275] no items to output this cycle
I0320 04:22:03.409788 543705 memory.go:184] no items to output this cycle
E0320 04:22:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:13.409807 543705 memory.go:191] Add success.
I0320 04:22:13.409815 543705 cpu.go:282] Add success.
W0320 04:22:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:22:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:22:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:22:13.420116 543705 net.go:648] Add success.
I0320 04:22:13.423679 543705 net.go:770] primary dev: ETH0
I0320 04:22:13.423692 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:22:13.423705 543705 net.go:698] Add success.
W0320 04:22:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:22:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 04:22:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:22:14.456908 543705 disk_worker.go:494] system disk:vda1
I0320 04:22:14.456958 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:22:14.457442 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:22:14.457449 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:22:14.457453 543705 custom_config.go:64] query custom config with name: gpu
E0320 04:22:15.456783 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:22:15.456791 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:22:16.457950 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:22:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:22:16.458003 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:22:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:22:16.472347 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:22:22.554116 543705 disk_info.go:125] begin check local disk info of client
I0320 04:22:22.556617 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:22:22.556624 543705 disk_info.go:196] parse disk info done, disk is : [0xc000368340 0xc000368380]
E0320 04:22:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:23.409773 543705 cpu.go:275] no items to output this cycle
I0320 04:22:23.409784 543705 memory.go:184] no items to output this cycle
E0320 04:22:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:33.409770 543705 memory.go:184] no items to output this cycle
I0320 04:22:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 04:22:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:43.409821 543705 memory.go:191] Add success.
I0320 04:22:43.409826 543705 cpu.go:282] Add success.
I0320 04:22:43.419953 543705 net.go:648] Add success.
I0320 04:22:43.422574 543705 net.go:770] primary dev: ETH0
I0320 04:22:43.422587 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:22:43.422602 543705 net.go:698] Add success.
I0320 04:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:22:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:22:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:22:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:22:53.409776 543705 memory.go:184] no items to output this cycle
I0320 04:22:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 04:23:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:03.409790 543705 memory.go:184] no items to output this cycle
I0320 04:23:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 04:23:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:13.409811 543705 memory.go:191] Add success.
I0320 04:23:13.409815 543705 cpu.go:282] Add success.
W0320 04:23:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:23:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:23:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:23:13.420056 543705 net.go:648] Add success.
I0320 04:23:13.422852 543705 net.go:770] primary dev: ETH0
I0320 04:23:13.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:23:13.422876 543705 net.go:698] Add success.
I0320 04:23:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:23:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:23:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 04:23:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:23:14.456519 543705 disk_worker.go:494] system disk:vda1
I0320 04:23:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:23:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:23:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:23:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:23:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:23:16.472091 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:23:22.557473 543705 disk_info.go:125] begin check local disk info of client
I0320 04:23:22.560018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:23:22.560025 543705 disk_info.go:196] parse disk info done, disk is : [0xc000516000 0xc000516040]
E0320 04:23:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:23.409788 543705 memory.go:184] no items to output this cycle
I0320 04:23:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:23:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:33.409775 543705 memory.go:184] no items to output this cycle
I0320 04:23:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 04:23:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:43.409820 543705 memory.go:191] Add success.
I0320 04:23:43.409834 543705 cpu.go:282] Add success.
I0320 04:23:43.420066 543705 net.go:648] Add success.
I0320 04:23:43.422601 543705 net.go:770] primary dev: ETH0
I0320 04:23:43.422616 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:23:43.422628 543705 net.go:698] Add success.
I0320 04:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:23:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:23:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:23:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:23:53.409783 543705 memory.go:184] no items to output this cycle
I0320 04:23:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 04:24:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:03.409810 543705 memory.go:184] no items to output this cycle
I0320 04:24:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:24:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:13.409785 543705 memory.go:191] Add success.
I0320 04:24:13.409802 543705 cpu.go:282] Add success.
W0320 04:24:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:24:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:24:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:24:13.420076 543705 net.go:648] Add success.
I0320 04:24:13.422728 543705 net.go:770] primary dev: ETH0
I0320 04:24:13.422741 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:24:13.422754 543705 net.go:698] Add success.
I0320 04:24:13.709606 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d1be6743-3050-44b2-a41e-cf99ee96e750","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:24:13.709638 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:24:14.453970 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:24:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:24:14.455288 543705 disk_worker.go:708] disk space is not compliant
W0320 04:24:14.455291 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:24:14.456823 543705 disk_worker.go:494] system disk:vda1
I0320 04:24:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:24:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:24:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:24:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:24:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:24:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:24:22.560441 543705 disk_info.go:125] begin check local disk info of client
I0320 04:24:22.562942 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:24:22.562949 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003962c0 0xc000396300]
E0320 04:24:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:23.409887 543705 memory.go:184] no items to output this cycle
I0320 04:24:23.409893 543705 cpu.go:275] no items to output this cycle
E0320 04:24:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:33.409761 543705 memory.go:184] no items to output this cycle
I0320 04:24:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 04:24:38.117734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:24:38.117741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:24:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:43.410928 543705 memory.go:191] Add success.
I0320 04:24:43.409825 543705 cpu.go:282] Add success.
I0320 04:24:43.420641 543705 net.go:648] Add success.
I0320 04:24:43.422940 543705 net.go:770] primary dev: ETH0
I0320 04:24:43.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:24:43.422966 543705 net.go:698] Add success.
I0320 04:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:24:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:24:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:24:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:24:53.409780 543705 memory.go:184] no items to output this cycle
I0320 04:24:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 04:25:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:03.409775 543705 memory.go:184] no items to output this cycle
I0320 04:25:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:25:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:13.409806 543705 memory.go:191] Add success.
I0320 04:25:13.409816 543705 cpu.go:282] Add success.
W0320 04:25:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:25:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:25:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:25:13.420158 543705 net.go:648] Add success.
I0320 04:25:13.422654 543705 net.go:770] primary dev: ETH0
I0320 04:25:13.422668 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:25:13.422681 543705 net.go:698] Add success.
I0320 04:25:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:25:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:25:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 04:25:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:25:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 04:25:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:25:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:25:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:25:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:25:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:25:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:25:22.563455 543705 disk_info.go:125] begin check local disk info of client
I0320 04:25:22.566081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:25:22.566089 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033fd80 0xc00033fdc0]
E0320 04:25:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:23.409861 543705 memory.go:184] no items to output this cycle
I0320 04:25:23.409927 543705 cpu.go:275] no items to output this cycle
E0320 04:25:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:33.409776 543705 memory.go:184] no items to output this cycle
I0320 04:25:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:25:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:43.409788 543705 memory.go:191] Add success.
I0320 04:25:43.409807 543705 cpu.go:282] Add success.
I0320 04:25:43.419882 543705 net.go:648] Add success.
I0320 04:25:43.422459 543705 net.go:770] primary dev: ETH0
I0320 04:25:43.422472 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:25:43.422484 543705 net.go:698] Add success.
I0320 04:25:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:25:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:25:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:25:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:25:53.409777 543705 memory.go:184] no items to output this cycle
I0320 04:25:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:26:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:03.409812 543705 memory.go:184] no items to output this cycle
I0320 04:26:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 04:26:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:13.409781 543705 memory.go:191] Add success.
I0320 04:26:13.409803 543705 cpu.go:282] Add success.
W0320 04:26:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:26:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:26:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:26:13.420183 543705 net.go:648] Add success.
I0320 04:26:13.423149 543705 net.go:770] primary dev: ETH0
I0320 04:26:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:26:13.423175 543705 net.go:698] Add success.
I0320 04:26:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:26:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:26:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 04:26:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:26:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 04:26:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:26:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:26:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:26:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:26:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:26:22.566175 543705 disk_info.go:125] begin check local disk info of client
I0320 04:26:22.568615 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:26:22.568622 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270600 0xc000270640]
E0320 04:26:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:23.409780 543705 memory.go:184] no items to output this cycle
I0320 04:26:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 04:26:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:33.409780 543705 memory.go:184] no items to output this cycle
I0320 04:26:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 04:26:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:43.409800 543705 memory.go:191] Add success.
I0320 04:26:43.409819 543705 cpu.go:282] Add success.
I0320 04:26:43.419911 543705 net.go:648] Add success.
I0320 04:26:43.422696 543705 net.go:770] primary dev: ETH0
I0320 04:26:43.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:26:43.422721 543705 net.go:698] Add success.
I0320 04:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:26:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:26:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:26:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:26:53.409793 543705 memory.go:184] no items to output this cycle
I0320 04:26:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 04:27:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:27:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 04:27:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:13.409792 543705 memory.go:191] Add success.
I0320 04:27:13.409795 543705 cpu.go:282] Add success.
W0320 04:27:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:27:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:27:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:27:13.420139 543705 net.go:648] Add success.
I0320 04:27:13.422889 543705 net.go:770] primary dev: ETH0
I0320 04:27:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:27:13.422917 543705 net.go:698] Add success.
I0320 04:27:13.428947 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 04:27:13.453134 543705 event_worker.go:152] Polling the log file for events...
I0320 04:27:13.679364 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98d243d5-f083-4cbe-baa1-7e4a53a23dac","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:27:13.679400 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 04:27:14.454861 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:27:14.454924 543705 disk_worker.go:708] disk space is not compliant
W0320 04:27:14.454928 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:27:14.455631 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:27:14.455640 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:27:14.455645 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:27:14.456523 543705 disk_worker.go:494] system disk:vda1
I0320 04:27:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:27:15.456791 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:27:15.456799 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:27:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:27:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:27:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:27:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:27:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:27:22.569532 543705 disk_info.go:125] begin check local disk info of client
I0320 04:27:22.572062 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:27:22.572069 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034eb80 0xc00034ebc0]
E0320 04:27:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:23.409786 543705 memory.go:184] no items to output this cycle
I0320 04:27:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:27:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:33.409777 543705 memory.go:184] no items to output this cycle
I0320 04:27:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 04:27:38.117880 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:27:38.117887 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:27:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:43.410588 543705 memory.go:191] Add success.
I0320 04:27:43.409805 543705 cpu.go:282] Add success.
I0320 04:27:43.420292 543705 net.go:648] Add success.
I0320 04:27:43.422897 543705 net.go:770] primary dev: ETH0
I0320 04:27:43.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:27:43.422923 543705 net.go:698] Add success.
I0320 04:27:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:27:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:27:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:27:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:27:53.409763 543705 memory.go:184] no items to output this cycle
I0320 04:27:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:28:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:03.409791 543705 memory.go:184] no items to output this cycle
I0320 04:28:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:28:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:13.409794 543705 memory.go:191] Add success.
I0320 04:28:13.409795 543705 cpu.go:282] Add success.
W0320 04:28:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:28:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:28:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:28:13.420124 543705 net.go:648] Add success.
I0320 04:28:13.422697 543705 net.go:770] primary dev: ETH0
I0320 04:28:13.422711 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:28:13.422726 543705 net.go:698] Add success.
I0320 04:28:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:28:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:28:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 04:28:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:28:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 04:28:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:28:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:28:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:28:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:28:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:28:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:28:22.572503 543705 disk_info.go:125] begin check local disk info of client
I0320 04:28:22.574986 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:28:22.574996 543705 disk_info.go:196] parse disk info done, disk is : [0xc000250000 0xc000250040]
E0320 04:28:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:23.409758 543705 memory.go:184] no items to output this cycle
I0320 04:28:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:28:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:33.409772 543705 memory.go:184] no items to output this cycle
I0320 04:28:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:28:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:43.409805 543705 memory.go:191] Add success.
I0320 04:28:43.409806 543705 cpu.go:282] Add success.
I0320 04:28:43.419977 543705 net.go:648] Add success.
I0320 04:28:43.422643 543705 net.go:770] primary dev: ETH0
I0320 04:28:43.422655 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:28:43.422668 543705 net.go:698] Add success.
I0320 04:28:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:28:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:28:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:28:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:28:53.409797 543705 memory.go:184] no items to output this cycle
I0320 04:28:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 04:29:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:03.409788 543705 memory.go:184] no items to output this cycle
I0320 04:29:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:29:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:13.409797 543705 memory.go:191] Add success.
I0320 04:29:13.409798 543705 cpu.go:282] Add success.
W0320 04:29:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:29:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:29:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:29:13.420157 543705 net.go:648] Add success.
I0320 04:29:13.423003 543705 net.go:770] primary dev: ETH0
I0320 04:29:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:29:13.423043 543705 net.go:698] Add success.
I0320 04:29:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:29:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:29:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 04:29:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:29:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 04:29:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:29:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:29:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:29:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:29:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:29:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:29:22.575512 543705 disk_info.go:125] begin check local disk info of client
I0320 04:29:22.578080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:29:22.578087 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033a440 0xc00033a480]
E0320 04:29:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:23.409788 543705 memory.go:184] no items to output this cycle
I0320 04:29:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:29:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:33.409806 543705 memory.go:184] no items to output this cycle
I0320 04:29:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 04:29:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:43.409790 543705 memory.go:191] Add success.
I0320 04:29:43.409819 543705 cpu.go:282] Add success.
I0320 04:29:43.419999 543705 net.go:648] Add success.
I0320 04:29:43.423045 543705 net.go:770] primary dev: ETH0
I0320 04:29:43.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:29:43.423075 543705 net.go:698] Add success.
I0320 04:29:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:29:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:29:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:29:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:29:53.409786 543705 memory.go:184] no items to output this cycle
I0320 04:29:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 04:30:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:03.409784 543705 memory.go:184] no items to output this cycle
I0320 04:30:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:30:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:13.409827 543705 memory.go:191] Add success.
I0320 04:30:13.409839 543705 cpu.go:282] Add success.
W0320 04:30:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:30:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:30:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:30:13.420134 543705 net.go:648] Add success.
I0320 04:30:13.423082 543705 net.go:770] primary dev: ETH0
I0320 04:30:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:30:13.423113 543705 net.go:698] Add success.
I0320 04:30:13.508584 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0bb93920-19cc-43bd-8a37-61a3daa90d7f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:30:13.508618 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:30:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:30:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:30:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 04:30:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:30:14.456513 543705 disk_worker.go:494] system disk:vda1
I0320 04:30:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:30:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:30:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:30:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:30:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:30:16.472412 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:30:22.578554 543705 disk_info.go:125] begin check local disk info of client
I0320 04:30:22.581069 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:30:22.581077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000575740 0xc000575780]
E0320 04:30:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:23.409801 543705 memory.go:184] no items to output this cycle
I0320 04:30:23.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:30:33.409893 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:33.409913 543705 memory.go:184] no items to output this cycle
I0320 04:30:33.409934 543705 cpu.go:275] no items to output this cycle
I0320 04:30:38.118025 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:30:38.118033 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:30:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:43.410627 543705 memory.go:191] Add success.
I0320 04:30:43.409841 543705 cpu.go:282] Add success.
I0320 04:30:43.420414 543705 net.go:648] Add success.
I0320 04:30:43.423270 543705 net.go:770] primary dev: ETH0
I0320 04:30:43.423284 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:30:43.423297 543705 net.go:698] Add success.
I0320 04:30:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:30:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:30:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:30:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:30:53.409778 543705 memory.go:184] no items to output this cycle
I0320 04:30:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:31:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:03.409793 543705 memory.go:184] no items to output this cycle
I0320 04:31:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 04:31:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:13.409815 543705 memory.go:191] Add success.
I0320 04:31:13.409816 543705 cpu.go:282] Add success.
W0320 04:31:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:31:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:31:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:31:13.420144 543705 net.go:648] Add success.
I0320 04:31:13.422678 543705 net.go:770] primary dev: ETH0
I0320 04:31:13.422691 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:31:13.422711 543705 net.go:698] Add success.
I0320 04:31:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:31:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:31:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 04:31:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:31:14.456541 543705 disk_worker.go:494] system disk:vda1
I0320 04:31:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:31:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:31:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:31:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:31:16.472428 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:31:22.581549 543705 disk_info.go:125] begin check local disk info of client
I0320 04:31:22.584095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:31:22.584103 543705 disk_info.go:196] parse disk info done, disk is : [0xc000382640 0xc000382680]
E0320 04:31:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:23.409801 543705 memory.go:184] no items to output this cycle
I0320 04:31:23.409816 543705 cpu.go:275] no items to output this cycle
E0320 04:31:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:33.409907 543705 memory.go:184] no items to output this cycle
I0320 04:31:33.409944 543705 cpu.go:275] no items to output this cycle
E0320 04:31:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:43.409813 543705 memory.go:191] Add success.
I0320 04:31:43.409830 543705 cpu.go:282] Add success.
I0320 04:31:43.420015 543705 net.go:648] Add success.
I0320 04:31:43.423008 543705 net.go:770] primary dev: ETH0
I0320 04:31:43.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:31:43.423033 543705 net.go:698] Add success.
I0320 04:31:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:31:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:31:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:31:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:31:53.409779 543705 memory.go:184] no items to output this cycle
I0320 04:31:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 04:32:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:03.409789 543705 memory.go:184] no items to output this cycle
I0320 04:32:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 04:32:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:13.409796 543705 memory.go:191] Add success.
I0320 04:32:13.409818 543705 cpu.go:282] Add success.
W0320 04:32:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:32:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:32:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:32:13.420145 543705 net.go:648] Add success.
I0320 04:32:13.422732 543705 net.go:770] primary dev: ETH0
I0320 04:32:13.422748 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:32:13.422774 543705 net.go:698] Add success.
W0320 04:32:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:32:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 04:32:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:32:14.455904 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:32:14.455912 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:32:14.455918 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:32:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 04:32:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:32:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:32:15.456850 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:32:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:32:16.457957 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:32:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:32:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:32:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:32:22.584565 543705 disk_info.go:125] begin check local disk info of client
I0320 04:32:22.587032 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:32:22.587039 543705 disk_info.go:196] parse disk info done, disk is : [0xc000307280 0xc0003072c0]
E0320 04:32:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:23.409797 543705 memory.go:184] no items to output this cycle
I0320 04:32:23.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:32:33.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 04:32:33.409833 543705 memory.go:184] no items to output this cycle
E0320 04:32:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:43.409805 543705 memory.go:191] Add success.
I0320 04:32:43.409831 543705 cpu.go:282] Add success.
I0320 04:32:43.419998 543705 net.go:648] Add success.
I0320 04:32:43.422487 543705 net.go:770] primary dev: ETH0
I0320 04:32:43.422499 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:32:43.422512 543705 net.go:698] Add success.
I0320 04:32:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:32:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:32:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:32:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:32:53.409810 543705 memory.go:184] no items to output this cycle
I0320 04:32:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 04:33:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:03.409789 543705 memory.go:184] no items to output this cycle
I0320 04:33:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 04:33:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:13.409786 543705 memory.go:191] Add success.
I0320 04:33:13.409803 543705 cpu.go:282] Add success.
W0320 04:33:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:33:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:33:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:33:13.420220 543705 net.go:648] Add success.
I0320 04:33:13.423046 543705 net.go:770] primary dev: ETH0
I0320 04:33:13.423059 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:33:13.423072 543705 net.go:698] Add success.
I0320 04:33:13.469079 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3df17fd0-f897-4d6b-8ce7-3abd43e1f858","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:33:13.469113 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:33:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:33:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:33:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 04:33:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:33:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 04:33:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:33:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:33:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:33:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:33:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:33:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:33:22.587125 543705 disk_info.go:125] begin check local disk info of client
I0320 04:33:22.589773 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:33:22.589781 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba780 0xc0002ba7c0]
E0320 04:33:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:23.409763 543705 memory.go:184] no items to output this cycle
I0320 04:33:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:33:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:33.409808 543705 memory.go:184] no items to output this cycle
I0320 04:33:33.409823 543705 cpu.go:275] no items to output this cycle
I0320 04:33:38.121291 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:33:38.121297 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
I0320 04:33:43.409952 543705 cpu.go:282] Add success.
E0320 04:33:43.409949 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:43.410653 543705 memory.go:191] Add success.
I0320 04:33:43.419728 543705 net.go:648] Add success.
I0320 04:33:43.422108 543705 net.go:770] primary dev: ETH0
I0320 04:33:43.422122 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:33:43.422135 543705 net.go:698] Add success.
I0320 04:33:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:33:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:33:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:33:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:33:53.409808 543705 memory.go:184] no items to output this cycle
I0320 04:33:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 04:34:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:03.409789 543705 memory.go:184] no items to output this cycle
I0320 04:34:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:34:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:13.409783 543705 memory.go:191] Add success.
I0320 04:34:13.409799 543705 cpu.go:282] Add success.
W0320 04:34:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:34:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:34:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:34:13.420101 543705 net.go:648] Add success.
I0320 04:34:13.422795 543705 net.go:770] primary dev: ETH0
I0320 04:34:13.422807 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:34:13.422820 543705 net.go:698] Add success.
I0320 04:34:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:34:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:34:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 04:34:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:34:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 04:34:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:34:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:34:16.458024 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:34:16.458092 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:34:16.458112 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:34:16.472426 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:34:22.590600 543705 disk_info.go:125] begin check local disk info of client
I0320 04:34:22.593026 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:34:22.593032 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328500 0xc000328540]
E0320 04:34:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:23.409781 543705 memory.go:184] no items to output this cycle
I0320 04:34:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:34:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 04:34:33.409807 543705 memory.go:184] no items to output this cycle
E0320 04:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:43.409791 543705 memory.go:191] Add success.
I0320 04:34:43.409795 543705 cpu.go:282] Add success.
I0320 04:34:43.419726 543705 net.go:648] Add success.
I0320 04:34:43.422488 543705 net.go:770] primary dev: ETH0
I0320 04:34:43.422501 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:34:43.422528 543705 net.go:698] Add success.
I0320 04:34:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:34:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:34:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:34:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:34:53.409770 543705 memory.go:184] no items to output this cycle
I0320 04:34:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 04:35:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:03.409809 543705 memory.go:184] no items to output this cycle
I0320 04:35:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 04:35:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:13.409799 543705 memory.go:191] Add success.
I0320 04:35:13.409801 543705 cpu.go:282] Add success.
W0320 04:35:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:35:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:35:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:35:13.420059 543705 net.go:648] Add success.
I0320 04:35:13.422610 543705 net.go:770] primary dev: ETH0
I0320 04:35:13.422625 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:35:13.422636 543705 net.go:698] Add success.
I0320 04:35:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:35:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:35:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 04:35:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:35:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 04:35:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:35:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:35:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:35:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:35:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:35:16.472421 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:35:22.593603 543705 disk_info.go:125] begin check local disk info of client
I0320 04:35:22.596200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:35:22.596206 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329d40 0xc000329d80]
E0320 04:35:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:23.409795 543705 memory.go:184] no items to output this cycle
I0320 04:35:23.409809 543705 cpu.go:275] no items to output this cycle
E0320 04:35:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:33.409786 543705 memory.go:184] no items to output this cycle
I0320 04:35:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 04:35:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:43.409794 543705 memory.go:191] Add success.
I0320 04:35:43.409816 543705 cpu.go:282] Add success.
I0320 04:35:43.420190 543705 net.go:648] Add success.
I0320 04:35:43.423043 543705 net.go:770] primary dev: ETH0
I0320 04:35:43.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:35:43.423067 543705 net.go:698] Add success.
I0320 04:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:35:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:35:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:35:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:35:53.409810 543705 memory.go:184] no items to output this cycle
I0320 04:35:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 04:36:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:03.409801 543705 cpu.go:275] no items to output this cycle
I0320 04:36:03.409810 543705 memory.go:184] no items to output this cycle
E0320 04:36:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:13.409822 543705 memory.go:191] Add success.
I0320 04:36:13.409835 543705 cpu.go:282] Add success.
W0320 04:36:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:36:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:36:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:36:13.420276 543705 net.go:648] Add success.
I0320 04:36:13.423252 543705 net.go:770] primary dev: ETH0
I0320 04:36:13.423270 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:36:13.423285 543705 net.go:698] Add success.
I0320 04:36:13.468740 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1fd70168-bb47-4c17-bbef-fe9aac4e3ce7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:36:13.468777 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:36:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:36:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 04:36:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:36:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 04:36:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:36:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:36:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:36:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:36:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:36:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:36:22.596632 543705 disk_info.go:125] begin check local disk info of client
I0320 04:36:22.599194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:36:22.599200 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028a540 0xc00028a580]
E0320 04:36:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:23.409776 543705 memory.go:184] no items to output this cycle
I0320 04:36:23.409793 543705 cpu.go:275] no items to output this cycle
E0320 04:36:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 04:36:33.409811 543705 memory.go:184] no items to output this cycle
I0320 04:36:38.121742 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:36:38.121749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:36:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:43.410744 543705 memory.go:191] Add success.
I0320 04:36:43.409812 543705 cpu.go:282] Add success.
I0320 04:36:43.420549 543705 net.go:648] Add success.
I0320 04:36:43.423217 543705 net.go:770] primary dev: ETH0
I0320 04:36:43.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:36:43.423242 543705 net.go:698] Add success.
I0320 04:36:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:36:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:36:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:36:53.409781 543705 memory.go:184] no items to output this cycle
I0320 04:36:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 04:37:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:03.409793 543705 memory.go:184] no items to output this cycle
I0320 04:37:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:37:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:13.409795 543705 memory.go:191] Add success.
I0320 04:37:13.409799 543705 cpu.go:282] Add success.
W0320 04:37:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:37:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:37:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:37:13.420151 543705 net.go:648] Add success.
I0320 04:37:13.422711 543705 net.go:770] primary dev: ETH0
I0320 04:37:13.422723 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:37:13.422736 543705 net.go:698] Add success.
I0320 04:37:13.453335 543705 event_worker.go:152] Polling the log file for events...
W0320 04:37:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:37:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 04:37:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:37:14.455866 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:37:14.455875 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:37:14.455881 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:37:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 04:37:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:37:15.456850 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:37:15.456858 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:37:16.457937 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:37:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:37:16.457991 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:37:16.458010 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:37:16.472335 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:37:22.599287 543705 disk_info.go:125] begin check local disk info of client
I0320 04:37:22.601906 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:37:22.601915 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037cfc0 0xc00037d000]
E0320 04:37:23.409738 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:23.409752 543705 memory.go:184] no items to output this cycle
I0320 04:37:23.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:37:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:33.409797 543705 memory.go:184] no items to output this cycle
I0320 04:37:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:37:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:43.409802 543705 memory.go:191] Add success.
I0320 04:37:43.409803 543705 cpu.go:282] Add success.
I0320 04:37:43.420042 543705 net.go:648] Add success.
I0320 04:37:43.422871 543705 net.go:770] primary dev: ETH0
I0320 04:37:43.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:37:43.422897 543705 net.go:698] Add success.
I0320 04:37:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:37:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:37:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:37:53.409910 543705 cpu.go:275] no items to output this cycle
E0320 04:37:53.409932 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:37:53.409947 543705 memory.go:184] no items to output this cycle
E0320 04:38:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:03.409784 543705 memory.go:184] no items to output this cycle
I0320 04:38:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 04:38:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:13.409821 543705 memory.go:191] Add success.
I0320 04:38:13.409829 543705 cpu.go:282] Add success.
W0320 04:38:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:38:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:38:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:38:13.420110 543705 net.go:648] Add success.
I0320 04:38:13.422675 543705 net.go:770] primary dev: ETH0
I0320 04:38:13.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:38:13.422705 543705 net.go:698] Add success.
I0320 04:38:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:38:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:38:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 04:38:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:38:14.456511 543705 disk_worker.go:494] system disk:vda1
I0320 04:38:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:38:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:38:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:38:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:38:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:38:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:38:22.602662 543705 disk_info.go:125] begin check local disk info of client
I0320 04:38:22.605121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:38:22.605128 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057b5c0 0xc00057b600]
E0320 04:38:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:23.409790 543705 memory.go:184] no items to output this cycle
I0320 04:38:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:38:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:33.409805 543705 memory.go:184] no items to output this cycle
I0320 04:38:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 04:38:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:43.409806 543705 memory.go:191] Add success.
I0320 04:38:43.409814 543705 cpu.go:282] Add success.
I0320 04:38:43.419950 543705 net.go:648] Add success.
I0320 04:38:43.422640 543705 net.go:770] primary dev: ETH0
I0320 04:38:43.422654 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:38:43.422667 543705 net.go:698] Add success.
I0320 04:38:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:38:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:38:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:38:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:38:53.409802 543705 memory.go:184] no items to output this cycle
I0320 04:38:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 04:39:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:03.409786 543705 memory.go:184] no items to output this cycle
I0320 04:39:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 04:39:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:13.409811 543705 memory.go:191] Add success.
I0320 04:39:13.409816 543705 cpu.go:282] Add success.
W0320 04:39:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:39:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:39:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:39:13.420137 543705 net.go:648] Add success.
I0320 04:39:13.422806 543705 net.go:770] primary dev: ETH0
I0320 04:39:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:39:13.422831 543705 net.go:698] Add success.
I0320 04:39:13.523065 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"94e1a022-da2e-4b80-a746-0495c969490e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:39:13.523101 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:39:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:39:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:39:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 04:39:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:39:14.456680 543705 disk_worker.go:494] system disk:vda1
I0320 04:39:14.456706 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:39:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:39:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:39:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:39:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:39:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:39:22.605675 543705 disk_info.go:125] begin check local disk info of client
I0320 04:39:22.608275 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:39:22.608282 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4f00 0xc0000c4f40]
E0320 04:39:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 04:39:23.409786 543705 memory.go:184] no items to output this cycle
E0320 04:39:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:33.409785 543705 memory.go:184] no items to output this cycle
I0320 04:39:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 04:39:38.125308 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:39:38.125314 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:39:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:43.410794 543705 memory.go:191] Add success.
I0320 04:39:43.409814 543705 cpu.go:282] Add success.
I0320 04:39:43.420553 543705 net.go:648] Add success.
I0320 04:39:43.423485 543705 net.go:770] primary dev: ETH0
I0320 04:39:43.423500 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:39:43.423515 543705 net.go:698] Add success.
I0320 04:39:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:39:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:39:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:39:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:39:53.409812 543705 memory.go:184] no items to output this cycle
I0320 04:39:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 04:40:03.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:03.409947 543705 memory.go:184] no items to output this cycle
I0320 04:40:03.410073 543705 cpu.go:275] no items to output this cycle
E0320 04:40:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:13.409798 543705 memory.go:191] Add success.
I0320 04:40:13.409802 543705 cpu.go:282] Add success.
W0320 04:40:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:40:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:40:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:40:13.419980 543705 net.go:770] primary dev: ETH0
I0320 04:40:13.419991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:40:13.420003 543705 net.go:698] Add success.
I0320 04:40:13.420372 543705 net.go:648] Add success.
I0320 04:40:14.454994 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:40:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:40:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 04:40:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:40:14.456513 543705 disk_worker.go:494] system disk:vda1
I0320 04:40:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:40:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:40:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:40:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:40:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:40:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:40:22.608682 543705 disk_info.go:125] begin check local disk info of client
I0320 04:40:22.611085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:40:22.611092 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4a00 0xc0000c4a40]
E0320 04:40:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:23.409786 543705 memory.go:184] no items to output this cycle
I0320 04:40:23.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:40:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:33.409781 543705 memory.go:184] no items to output this cycle
I0320 04:40:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 04:40:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:43.409790 543705 memory.go:191] Add success.
I0320 04:40:43.409791 543705 cpu.go:282] Add success.
I0320 04:40:43.419861 543705 net.go:648] Add success.
I0320 04:40:43.422736 543705 net.go:770] primary dev: ETH0
I0320 04:40:43.422751 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:40:43.422765 543705 net.go:698] Add success.
I0320 04:40:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:40:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:40:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:40:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:40:53.409797 543705 memory.go:184] no items to output this cycle
I0320 04:40:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 04:41:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:41:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:41:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:13.409784 543705 memory.go:191] Add success.
I0320 04:41:13.409806 543705 cpu.go:282] Add success.
W0320 04:41:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:41:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:41:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:41:13.420315 543705 net.go:648] Add success.
I0320 04:41:13.422889 543705 net.go:770] primary dev: ETH0
I0320 04:41:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:41:13.422918 543705 net.go:698] Add success.
I0320 04:41:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:41:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:41:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 04:41:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:41:14.456509 543705 disk_worker.go:494] system disk:vda1
I0320 04:41:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:41:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:41:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:41:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:41:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:41:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:41:22.611701 543705 disk_info.go:125] begin check local disk info of client
I0320 04:41:22.614286 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:41:22.614292 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ec400 0xc0000ec440]
E0320 04:41:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:23.409786 543705 memory.go:184] no items to output this cycle
I0320 04:41:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:41:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:33.409803 543705 memory.go:184] no items to output this cycle
I0320 04:41:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 04:41:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:43.409780 543705 memory.go:191] Add success.
I0320 04:41:43.409802 543705 cpu.go:282] Add success.
I0320 04:41:43.419854 543705 net.go:648] Add success.
I0320 04:41:43.422592 543705 net.go:770] primary dev: ETH0
I0320 04:41:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:41:43.422616 543705 net.go:698] Add success.
I0320 04:41:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:41:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:41:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:41:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:41:53.409778 543705 memory.go:184] no items to output this cycle
I0320 04:41:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 04:42:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:03.409773 543705 memory.go:184] no items to output this cycle
I0320 04:42:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:42:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:13.409810 543705 memory.go:191] Add success.
I0320 04:42:13.409817 543705 cpu.go:282] Add success.
W0320 04:42:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:42:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:42:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:42:13.420074 543705 net.go:648] Add success.
I0320 04:42:13.422470 543705 net.go:770] primary dev: ETH0
I0320 04:42:13.422483 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:42:13.422494 543705 net.go:698] Add success.
I0320 04:42:13.469230 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3ad8088a-8c39-45ee-a0c2-361d2ba7e10c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:42:13.469265 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 04:42:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:42:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 04:42:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:42:14.457027 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:42:14.457047 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:42:14.457053 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:42:14.457096 543705 disk_worker.go:494] system disk:vda1
I0320 04:42:14.457142 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:42:15.456449 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:42:15.456458 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:42:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:42:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:42:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:42:16.458024 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:42:16.472344 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:42:22.614724 543705 disk_info.go:125] begin check local disk info of client
I0320 04:42:22.617118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:42:22.617124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ec580 0xc0000ec5c0]
E0320 04:42:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:23.409787 543705 memory.go:184] no items to output this cycle
I0320 04:42:23.409801 543705 cpu.go:275] no items to output this cycle
E0320 04:42:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:33.409806 543705 memory.go:184] no items to output this cycle
I0320 04:42:33.409815 543705 cpu.go:275] no items to output this cycle
I0320 04:42:38.125734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:42:38.125740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:42:43.409865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:43.410872 543705 memory.go:191] Add success.
I0320 04:42:43.409881 543705 cpu.go:282] Add success.
I0320 04:42:43.419777 543705 net.go:648] Add success.
I0320 04:42:43.422202 543705 net.go:770] primary dev: ETH0
I0320 04:42:43.422215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:42:43.422227 543705 net.go:698] Add success.
I0320 04:42:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:42:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:42:46.458093 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:42:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:42:53.409787 543705 cpu.go:275] no items to output this cycle
I0320 04:42:53.409792 543705 memory.go:184] no items to output this cycle
E0320 04:43:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:03.409805 543705 memory.go:184] no items to output this cycle
I0320 04:43:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 04:43:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:13.409799 543705 memory.go:191] Add success.
I0320 04:43:13.409799 543705 cpu.go:282] Add success.
W0320 04:43:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:43:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:43:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:43:13.420256 543705 net.go:648] Add success.
I0320 04:43:13.423334 543705 net.go:770] primary dev: ETH0
I0320 04:43:13.423348 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:43:13.423363 543705 net.go:698] Add success.
I0320 04:43:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:43:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:43:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 04:43:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:43:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 04:43:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:43:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:43:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:43:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:43:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:43:22.617675 543705 disk_info.go:125] begin check local disk info of client
I0320 04:43:22.620224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:43:22.620231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c59c0 0xc0000c5a00]
E0320 04:43:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:23.409793 543705 memory.go:184] no items to output this cycle
I0320 04:43:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:43:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:33.409794 543705 memory.go:184] no items to output this cycle
I0320 04:43:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:43:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:43.409794 543705 memory.go:191] Add success.
I0320 04:43:43.409803 543705 cpu.go:282] Add success.
I0320 04:43:43.419865 543705 net.go:648] Add success.
I0320 04:43:43.422690 543705 net.go:770] primary dev: ETH0
I0320 04:43:43.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:43:43.422714 543705 net.go:698] Add success.
I0320 04:43:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:43:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:43:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:43:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:43:53.409778 543705 memory.go:184] no items to output this cycle
I0320 04:43:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 04:44:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:03.409781 543705 memory.go:184] no items to output this cycle
I0320 04:44:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 04:44:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:13.409816 543705 memory.go:191] Add success.
I0320 04:44:13.409823 543705 cpu.go:282] Add success.
W0320 04:44:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:44:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:44:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:44:13.420262 543705 net.go:648] Add success.
I0320 04:44:13.423380 543705 net.go:770] primary dev: ETH0
I0320 04:44:13.423394 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:44:13.423408 543705 net.go:698] Add success.
I0320 04:44:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:44:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:44:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 04:44:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:44:14.456480 543705 disk_worker.go:494] system disk:vda1
I0320 04:44:14.456523 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:44:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:44:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:44:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:44:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:44:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:44:22.620760 543705 disk_info.go:125] begin check local disk info of client
I0320 04:44:22.623204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:44:22.623211 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377180 0xc0003771c0]
E0320 04:44:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:23.409789 543705 memory.go:184] no items to output this cycle
I0320 04:44:23.409992 543705 cpu.go:275] no items to output this cycle
E0320 04:44:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:33.409778 543705 memory.go:184] no items to output this cycle
I0320 04:44:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 04:44:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:43.409818 543705 memory.go:191] Add success.
I0320 04:44:43.409820 543705 cpu.go:282] Add success.
I0320 04:44:43.420002 543705 net.go:648] Add success.
I0320 04:44:43.422759 543705 net.go:770] primary dev: ETH0
I0320 04:44:43.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:44:43.422785 543705 net.go:698] Add success.
I0320 04:44:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:44:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:44:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:44:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:44:53.409765 543705 memory.go:184] no items to output this cycle
I0320 04:44:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 04:45:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:03.409807 543705 memory.go:184] no items to output this cycle
I0320 04:45:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 04:45:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:13.409781 543705 memory.go:191] Add success.
I0320 04:45:13.409804 543705 cpu.go:282] Add success.
W0320 04:45:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:45:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:45:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:45:13.420112 543705 net.go:648] Add success.
I0320 04:45:13.423191 543705 net.go:770] primary dev: ETH0
I0320 04:45:13.423204 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:45:13.423215 543705 net.go:698] Add success.
I0320 04:45:13.463399 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e024982-104c-4eab-8035-e4235e354e4d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:45:13.463434 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:45:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:45:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:45:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 04:45:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:45:14.456531 543705 disk_worker.go:494] system disk:vda1
I0320 04:45:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:45:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:45:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:45:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:45:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:45:16.472494 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:45:22.623761 543705 disk_info.go:125] begin check local disk info of client
I0320 04:45:22.626385 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:45:22.626393 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364d40 0xc000364d80]
E0320 04:45:23.407883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:23.407897 543705 memory.go:184] no items to output this cycle
I0320 04:45:23.407898 543705 cpu.go:275] no items to output this cycle
E0320 04:45:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:33.409781 543705 memory.go:184] no items to output this cycle
I0320 04:45:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 04:45:38.125879 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:45:38.125886 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:45:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:43.410723 543705 memory.go:191] Add success.
I0320 04:45:43.409804 543705 cpu.go:282] Add success.
I0320 04:45:43.420447 543705 net.go:648] Add success.
I0320 04:45:43.423377 543705 net.go:770] primary dev: ETH0
I0320 04:45:43.423389 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:45:43.423403 543705 net.go:698] Add success.
I0320 04:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:45:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:45:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:45:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:45:53.409778 543705 memory.go:184] no items to output this cycle
I0320 04:45:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:46:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:03.409785 543705 memory.go:184] no items to output this cycle
I0320 04:46:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:46:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:13.409773 543705 memory.go:191] Add success.
W0320 04:46:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 04:46:13.409807 543705 cpu.go:282] Add success.
W0320 04:46:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:46:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:46:13.420104 543705 net.go:648] Add success.
I0320 04:46:13.422836 543705 net.go:770] primary dev: ETH0
I0320 04:46:13.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:46:13.422865 543705 net.go:698] Add success.
I0320 04:46:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:46:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:46:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0320 04:46:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:46:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 04:46:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:46:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:46:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:46:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:46:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:46:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:46:22.626806 543705 disk_info.go:125] begin check local disk info of client
I0320 04:46:22.629222 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:46:22.629229 543705 disk_info.go:196] parse disk info done, disk is : [0xc000508a40 0xc000508a80]
E0320 04:46:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:23.409797 543705 memory.go:184] no items to output this cycle
I0320 04:46:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 04:46:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:33.409800 543705 memory.go:184] no items to output this cycle
I0320 04:46:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 04:46:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:43.409783 543705 memory.go:191] Add success.
I0320 04:46:43.409806 543705 cpu.go:282] Add success.
I0320 04:46:43.419866 543705 net.go:648] Add success.
I0320 04:46:43.422815 543705 net.go:770] primary dev: ETH0
I0320 04:46:43.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:46:43.422846 543705 net.go:698] Add success.
I0320 04:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:46:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:46:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:46:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:46:53.409803 543705 memory.go:184] no items to output this cycle
I0320 04:46:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 04:47:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:03.409800 543705 memory.go:184] no items to output this cycle
I0320 04:47:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:47:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:13.409795 543705 memory.go:191] Add success.
I0320 04:47:13.409795 543705 cpu.go:282] Add success.
W0320 04:47:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:47:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:47:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:47:13.420117 543705 net.go:648] Add success.
I0320 04:47:13.422685 543705 net.go:770] primary dev: ETH0
I0320 04:47:13.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:47:13.422710 543705 net.go:698] Add success.
I0320 04:47:13.453270 543705 event_worker.go:152] Polling the log file for events...
W0320 04:47:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:47:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 04:47:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:47:14.456146 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:47:14.456155 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:47:14.456161 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:47:14.456475 543705 disk_worker.go:494] system disk:vda1
I0320 04:47:14.456505 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:47:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:47:15.456816 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:47:16.457947 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:47:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:47:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:47:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:47:16.472336 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:47:22.629675 543705 disk_info.go:125] begin check local disk info of client
I0320 04:47:22.632201 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:47:22.632209 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039e000 0xc00039ef40]
E0320 04:47:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:23.409787 543705 memory.go:184] no items to output this cycle
I0320 04:47:23.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:47:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:33.409782 543705 memory.go:184] no items to output this cycle
I0320 04:47:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 04:47:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:43.409794 543705 memory.go:191] Add success.
I0320 04:47:43.409795 543705 cpu.go:282] Add success.
I0320 04:47:43.419866 543705 net.go:648] Add success.
I0320 04:47:43.422517 543705 net.go:770] primary dev: ETH0
I0320 04:47:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:47:43.422552 543705 net.go:698] Add success.
I0320 04:47:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:47:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:47:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:47:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:47:53.409773 543705 memory.go:184] no items to output this cycle
I0320 04:47:53.409776 543705 cpu.go:275] no items to output this cycle
E0320 04:48:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:03.409808 543705 memory.go:184] no items to output this cycle
I0320 04:48:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 04:48:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:13.409786 543705 memory.go:191] Add success.
I0320 04:48:13.409807 543705 cpu.go:282] Add success.
W0320 04:48:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:48:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:48:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:48:13.419941 543705 net.go:770] primary dev: ETH0
I0320 04:48:13.419955 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:48:13.419967 543705 net.go:698] Add success.
I0320 04:48:13.420212 543705 net.go:648] Add success.
I0320 04:48:13.476933 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b16f167e-94a4-4769-a58b-30b980fdd6a0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:48:13.476967 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:48:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:48:14.455222 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:48:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0320 04:48:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:48:14.456810 543705 disk_worker.go:494] system disk:vda1
I0320 04:48:14.456840 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:48:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:48:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:48:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:48:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:48:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:48:22.632825 543705 disk_info.go:125] begin check local disk info of client
I0320 04:48:22.635309 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:48:22.635316 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2440 0xc0003b2480]
E0320 04:48:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:23.409763 543705 memory.go:184] no items to output this cycle
I0320 04:48:23.409803 543705 cpu.go:275] no items to output this cycle
E0320 04:48:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:33.409785 543705 memory.go:184] no items to output this cycle
I0320 04:48:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 04:48:38.129333 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:48:38.129339 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:48:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:43.410574 543705 memory.go:191] Add success.
I0320 04:48:43.409800 543705 cpu.go:282] Add success.
I0320 04:48:43.420297 543705 net.go:648] Add success.
I0320 04:48:43.422925 543705 net.go:770] primary dev: ETH0
I0320 04:48:43.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:48:43.422951 543705 net.go:698] Add success.
I0320 04:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:48:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:48:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:48:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:48:53.409794 543705 memory.go:184] no items to output this cycle
I0320 04:48:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 04:49:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:03.409808 543705 memory.go:184] no items to output this cycle
I0320 04:49:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:49:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:13.409776 543705 memory.go:191] Add success.
I0320 04:49:13.409801 543705 cpu.go:282] Add success.
W0320 04:49:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:49:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:49:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:49:13.420070 543705 net.go:648] Add success.
I0320 04:49:13.422535 543705 net.go:770] primary dev: ETH0
I0320 04:49:13.422550 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:49:13.422563 543705 net.go:698] Add success.
I0320 04:49:14.454611 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:49:14.454781 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:49:14.454867 543705 disk_worker.go:708] disk space is not compliant
W0320 04:49:14.454870 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:49:14.456253 543705 disk_worker.go:494] system disk:vda1
I0320 04:49:14.456285 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:49:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:49:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:49:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:49:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:49:16.472448 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:49:22.635396 543705 disk_info.go:125] begin check local disk info of client
I0320 04:49:22.637966 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:49:22.637974 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed4c0 0xc0000ed500]
E0320 04:49:23.409850 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:23.409866 543705 memory.go:184] no items to output this cycle
I0320 04:49:23.410002 543705 cpu.go:275] no items to output this cycle
E0320 04:49:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:33.409801 543705 memory.go:184] no items to output this cycle
I0320 04:49:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 04:49:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:43.409791 543705 memory.go:191] Add success.
I0320 04:49:43.409792 543705 cpu.go:282] Add success.
I0320 04:49:43.420017 543705 net.go:648] Add success.
I0320 04:49:43.422867 543705 net.go:770] primary dev: ETH0
I0320 04:49:43.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:49:43.422892 543705 net.go:698] Add success.
I0320 04:49:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:49:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:49:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:49:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:49:53.409797 543705 memory.go:184] no items to output this cycle
I0320 04:49:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:50:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:03.409784 543705 cpu.go:275] no items to output this cycle
I0320 04:50:03.409790 543705 memory.go:184] no items to output this cycle
E0320 04:50:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:13.409815 543705 memory.go:191] Add success.
I0320 04:50:13.409821 543705 cpu.go:282] Add success.
W0320 04:50:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:50:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:50:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:50:13.420164 543705 net.go:648] Add success.
I0320 04:50:13.423261 543705 net.go:770] primary dev: ETH0
I0320 04:50:13.423276 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:50:13.423289 543705 net.go:698] Add success.
I0320 04:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:50:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:50:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 04:50:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:50:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 04:50:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:50:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:50:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:50:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:50:16.472378 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:50:22.638944 543705 disk_info.go:125] begin check local disk info of client
I0320 04:50:22.641331 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:50:22.641338 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001dd100 0xc0001dd140]
E0320 04:50:23.407892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:23.407912 543705 memory.go:184] no items to output this cycle
I0320 04:50:23.407924 543705 cpu.go:275] no items to output this cycle
E0320 04:50:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:33.409782 543705 memory.go:184] no items to output this cycle
I0320 04:50:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 04:50:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:43.409821 543705 memory.go:191] Add success.
I0320 04:50:43.409823 543705 cpu.go:282] Add success.
I0320 04:50:43.419980 543705 net.go:648] Add success.
I0320 04:50:43.422518 543705 net.go:770] primary dev: ETH0
I0320 04:50:43.422531 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:50:43.422543 543705 net.go:698] Add success.
I0320 04:50:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:50:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:50:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:50:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:50:53.409796 543705 memory.go:184] no items to output this cycle
I0320 04:50:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 04:51:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:03.409782 543705 memory.go:184] no items to output this cycle
I0320 04:51:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 04:51:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:13.409792 543705 memory.go:191] Add success.
I0320 04:51:13.409797 543705 cpu.go:282] Add success.
W0320 04:51:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:51:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:51:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:51:13.420081 543705 net.go:648] Add success.
I0320 04:51:13.422662 543705 net.go:770] primary dev: ETH0
I0320 04:51:13.422677 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:51:13.422691 543705 net.go:698] Add success.
I0320 04:51:13.464560 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c419629a-04ea-4cce-9fa4-cadc49021b3a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:51:13.464594 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:51:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:51:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:51:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 04:51:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:51:14.456694 543705 disk_worker.go:494] system disk:vda1
I0320 04:51:14.456738 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:51:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:51:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:51:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:51:16.458105 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:51:16.472523 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:51:22.641676 543705 disk_info.go:125] begin check local disk info of client
I0320 04:51:22.644151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:51:22.644157 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396f00 0xc000396f40]
E0320 04:51:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:23.409795 543705 memory.go:184] no items to output this cycle
I0320 04:51:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 04:51:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:33.409784 543705 memory.go:184] no items to output this cycle
I0320 04:51:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 04:51:38.129735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:51:38.129741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:51:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:43.410631 543705 memory.go:191] Add success.
I0320 04:51:43.409796 543705 cpu.go:282] Add success.
I0320 04:51:43.420341 543705 net.go:648] Add success.
I0320 04:51:43.422970 543705 net.go:770] primary dev: ETH0
I0320 04:51:43.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:51:43.422998 543705 net.go:698] Add success.
I0320 04:51:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:51:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:51:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:51:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:51:53.409766 543705 memory.go:184] no items to output this cycle
I0320 04:51:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 04:52:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:03.409785 543705 memory.go:184] no items to output this cycle
I0320 04:52:03.409790 543705 cpu.go:275] no items to output this cycle
W0320 04:52:13.409707 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:52:13.409724 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:52:13.409728 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:52:13.409801 543705 cpu.go:282] Add success.
E0320 04:52:13.409820 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:13.409842 543705 memory.go:191] Add success.
I0320 04:52:13.420058 543705 net.go:648] Add success.
I0320 04:52:13.422683 543705 net.go:770] primary dev: ETH0
I0320 04:52:13.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:52:13.422709 543705 net.go:698] Add success.
W0320 04:52:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:52:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 04:52:14.455198 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:52:14.455859 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:52:14.455866 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:52:14.455871 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:52:14.456785 543705 disk_worker.go:494] system disk:vda1
I0320 04:52:14.456815 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:52:15.456923 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:52:15.456936 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:52:16.457940 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:52:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:52:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:52:16.458013 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:52:16.472369 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:52:22.644863 543705 disk_info.go:125] begin check local disk info of client
I0320 04:52:22.647231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:52:22.647239 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028a7c0 0xc00028a800]
E0320 04:52:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:23.409772 543705 memory.go:184] no items to output this cycle
I0320 04:52:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 04:52:33.409887 543705 cpu.go:275] no items to output this cycle
E0320 04:52:33.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:33.409904 543705 memory.go:184] no items to output this cycle
E0320 04:52:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:43.409794 543705 memory.go:191] Add success.
I0320 04:52:43.409804 543705 cpu.go:282] Add success.
I0320 04:52:43.420017 543705 net.go:648] Add success.
I0320 04:52:43.422563 543705 net.go:770] primary dev: ETH0
I0320 04:52:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:52:43.422588 543705 net.go:698] Add success.
I0320 04:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:52:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:52:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:52:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:52:53.409777 543705 memory.go:184] no items to output this cycle
I0320 04:52:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 04:53:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:03.409805 543705 memory.go:184] no items to output this cycle
I0320 04:53:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 04:53:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:13.409781 543705 memory.go:191] Add success.
I0320 04:53:13.409803 543705 cpu.go:282] Add success.
W0320 04:53:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:53:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:53:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:53:13.420117 543705 net.go:648] Add success.
I0320 04:53:13.423136 543705 net.go:770] primary dev: ETH0
I0320 04:53:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:53:13.423175 543705 net.go:698] Add success.
I0320 04:53:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:53:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:53:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 04:53:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:53:14.456532 543705 disk_worker.go:494] system disk:vda1
I0320 04:53:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:53:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:53:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:53:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:53:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:53:16.472434 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:53:22.647879 543705 disk_info.go:125] begin check local disk info of client
I0320 04:53:22.650378 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:53:22.650384 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b640 0xc00007b680]
E0320 04:53:23.407876 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:23.407890 543705 memory.go:184] no items to output this cycle
I0320 04:53:23.407898 543705 cpu.go:275] no items to output this cycle
E0320 04:53:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:33.409803 543705 memory.go:184] no items to output this cycle
I0320 04:53:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 04:53:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:43.409793 543705 cpu.go:282] Add success.
I0320 04:53:43.409801 543705 memory.go:191] Add success.
I0320 04:53:43.419943 543705 net.go:648] Add success.
I0320 04:53:43.422571 543705 net.go:770] primary dev: ETH0
I0320 04:53:43.422584 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:53:43.422595 543705 net.go:698] Add success.
I0320 04:53:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:53:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:53:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:53:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:53:53.409768 543705 memory.go:184] no items to output this cycle
I0320 04:53:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 04:54:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:54:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 04:54:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:13.409779 543705 memory.go:191] Add success.
I0320 04:54:13.409803 543705 cpu.go:282] Add success.
W0320 04:54:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:54:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:54:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:54:13.420120 543705 net.go:648] Add success.
I0320 04:54:13.422705 543705 net.go:770] primary dev: ETH0
I0320 04:54:13.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:54:13.422732 543705 net.go:698] Add success.
I0320 04:54:13.469075 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"75c76bf2-1da2-4de6-a56d-ab17746109d1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:54:13.469107 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 04:54:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:54:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:54:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 04:54:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:54:14.456497 543705 disk_worker.go:494] system disk:vda1
I0320 04:54:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:54:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:54:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:54:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:54:16.472400 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:54:22.650470 543705 disk_info.go:125] begin check local disk info of client
I0320 04:54:22.652891 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:54:22.652897 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032b780 0xc00032b7c0]
E0320 04:54:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:23.409796 543705 memory.go:184] no items to output this cycle
I0320 04:54:23.409806 543705 cpu.go:275] no items to output this cycle
E0320 04:54:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:33.409778 543705 memory.go:184] no items to output this cycle
I0320 04:54:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 04:54:38.133363 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:54:38.133369 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:54:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:43.410689 543705 memory.go:191] Add success.
I0320 04:54:43.409825 543705 cpu.go:282] Add success.
I0320 04:54:43.420388 543705 net.go:648] Add success.
I0320 04:54:43.423193 543705 net.go:770] primary dev: ETH0
I0320 04:54:43.423205 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:54:43.423218 543705 net.go:698] Add success.
I0320 04:54:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:54:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:54:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:54:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:54:53.409802 543705 memory.go:184] no items to output this cycle
I0320 04:54:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 04:55:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:55:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 04:55:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:13.409791 543705 memory.go:191] Add success.
I0320 04:55:13.409795 543705 cpu.go:282] Add success.
W0320 04:55:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:55:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:55:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:55:13.420113 543705 net.go:648] Add success.
I0320 04:55:13.422534 543705 net.go:770] primary dev: ETH0
I0320 04:55:13.422547 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:55:13.422558 543705 net.go:698] Add success.
I0320 04:55:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:55:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:55:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 04:55:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:55:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 04:55:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:55:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:55:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:55:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:55:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:55:16.472365 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:55:22.653675 543705 disk_info.go:125] begin check local disk info of client
I0320 04:55:22.656246 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:55:22.656254 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e100 0xc00034e140]
E0320 04:55:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:23.409767 543705 cpu.go:275] no items to output this cycle
I0320 04:55:23.409775 543705 memory.go:184] no items to output this cycle
E0320 04:55:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:33.409799 543705 memory.go:184] no items to output this cycle
I0320 04:55:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 04:55:43.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:43.409908 543705 cpu.go:282] Add success.
I0320 04:55:43.409932 543705 memory.go:191] Add success.
I0320 04:55:43.419713 543705 net.go:648] Add success.
I0320 04:55:43.422481 543705 net.go:770] primary dev: ETH0
I0320 04:55:43.422493 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:55:43.422506 543705 net.go:698] Add success.
I0320 04:55:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:55:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:55:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:55:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:55:53.409767 543705 memory.go:184] no items to output this cycle
I0320 04:55:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 04:56:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:03.409788 543705 memory.go:184] no items to output this cycle
I0320 04:56:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 04:56:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:13.409791 543705 memory.go:191] Add success.
I0320 04:56:13.409810 543705 cpu.go:282] Add success.
W0320 04:56:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:56:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:56:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:56:13.420222 543705 net.go:648] Add success.
I0320 04:56:13.423234 543705 net.go:770] primary dev: ETH0
I0320 04:56:13.423247 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:56:13.423260 543705 net.go:698] Add success.
I0320 04:56:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:56:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:56:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 04:56:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:56:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 04:56:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:56:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:56:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:56:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:56:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:56:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:56:22.656917 543705 disk_info.go:125] begin check local disk info of client
I0320 04:56:22.659439 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:56:22.659446 543705 disk_info.go:196] parse disk info done, disk is : [0xc000228080 0xc0002280c0]
E0320 04:56:23.407879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:23.407899 543705 memory.go:184] no items to output this cycle
I0320 04:56:23.407903 543705 cpu.go:275] no items to output this cycle
E0320 04:56:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:33.409783 543705 memory.go:184] no items to output this cycle
I0320 04:56:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 04:56:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:43.409791 543705 memory.go:191] Add success.
I0320 04:56:43.409804 543705 cpu.go:282] Add success.
I0320 04:56:43.420303 543705 net.go:648] Add success.
I0320 04:56:43.423091 543705 net.go:770] primary dev: ETH0
I0320 04:56:43.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:56:43.423115 543705 net.go:698] Add success.
I0320 04:56:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:56:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:56:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:56:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:56:53.409779 543705 memory.go:184] no items to output this cycle
I0320 04:56:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 04:57:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:03.409810 543705 memory.go:184] no items to output this cycle
I0320 04:57:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 04:57:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:13.409815 543705 memory.go:191] Add success.
I0320 04:57:13.409825 543705 cpu.go:282] Add success.
W0320 04:57:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:57:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:57:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:57:13.420349 543705 net.go:648] Add success.
I0320 04:57:13.428764 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 04:57:13.428838 543705 net.go:770] primary dev: ETH0
I0320 04:57:13.428849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:57:13.428861 543705 net.go:698] Add success.
I0320 04:57:13.453384 543705 event_worker.go:152] Polling the log file for events...
I0320 04:57:14.300172 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a371197b-008f-437c-8538-87a69d5e3e39","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 04:57:14.300207 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 04:57:14.454310 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:57:14.454322 543705 disk_worker.go:708] disk space is not compliant
W0320 04:57:14.454326 543705 disk_worker.go:728] disk inode is not compliant
E0320 04:57:14.454810 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 04:57:14.454820 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 04:57:14.454825 543705 custom_config.go:64] query custom config with name: gpu
I0320 04:57:14.455861 543705 disk_worker.go:494] system disk:vda1
I0320 04:57:14.455891 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 04:57:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 04:57:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:57:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 04:57:16.457933 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 04:57:16.457988 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:57:16.458008 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:57:16.472330 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:57:22.659948 543705 disk_info.go:125] begin check local disk info of client
I0320 04:57:22.662541 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:57:22.662548 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d0800 0xc0003d0840]
E0320 04:57:23.407510 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:23.407525 543705 memory.go:184] no items to output this cycle
I0320 04:57:23.407555 543705 cpu.go:275] no items to output this cycle
E0320 04:57:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:33.409770 543705 memory.go:184] no items to output this cycle
I0320 04:57:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 04:57:38.133742 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 04:57:38.133749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 04:57:43.409923 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:43.410600 543705 memory.go:191] Add success.
I0320 04:57:43.409923 543705 cpu.go:282] Add success.
I0320 04:57:43.419732 543705 net.go:648] Add success.
I0320 04:57:43.422536 543705 net.go:770] primary dev: ETH0
I0320 04:57:43.422550 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:57:43.422564 543705 net.go:698] Add success.
I0320 04:57:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:57:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:57:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:57:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:57:53.409774 543705 memory.go:184] no items to output this cycle
I0320 04:57:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 04:58:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:03.409807 543705 memory.go:184] no items to output this cycle
I0320 04:58:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 04:58:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:13.409799 543705 memory.go:191] Add success.
I0320 04:58:13.409815 543705 cpu.go:282] Add success.
W0320 04:58:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:58:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:58:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:58:13.420142 543705 net.go:648] Add success.
I0320 04:58:13.423051 543705 net.go:770] primary dev: ETH0
I0320 04:58:13.423063 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:58:13.423075 543705 net.go:698] Add success.
I0320 04:58:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:58:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:58:14.455151 543705 disk_worker.go:708] disk space is not compliant
W0320 04:58:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:58:14.456479 543705 disk_worker.go:494] system disk:vda1
I0320 04:58:14.456524 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:58:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:58:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:58:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:58:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:58:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:58:22.662944 543705 disk_info.go:125] begin check local disk info of client
I0320 04:58:22.665391 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:58:22.665397 543705 disk_info.go:196] parse disk info done, disk is : [0xc000347c40 0xc000347c80]
E0320 04:58:23.407905 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:23.407924 543705 memory.go:184] no items to output this cycle
I0320 04:58:23.407937 543705 cpu.go:275] no items to output this cycle
E0320 04:58:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:33.409804 543705 memory.go:184] no items to output this cycle
I0320 04:58:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 04:58:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:43.409784 543705 memory.go:191] Add success.
I0320 04:58:43.409809 543705 cpu.go:282] Add success.
I0320 04:58:43.419990 543705 net.go:648] Add success.
I0320 04:58:43.422976 543705 net.go:770] primary dev: ETH0
I0320 04:58:43.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:58:43.423000 543705 net.go:698] Add success.
I0320 04:58:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:58:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:58:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:58:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:58:53.409804 543705 memory.go:184] no items to output this cycle
I0320 04:58:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 04:59:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:03.409787 543705 memory.go:184] no items to output this cycle
I0320 04:59:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 04:59:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:13.409783 543705 memory.go:191] Add success.
I0320 04:59:13.409802 543705 cpu.go:282] Add success.
W0320 04:59:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 04:59:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 04:59:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 04:59:13.420686 543705 net.go:648] Add success.
I0320 04:59:13.423248 543705 net.go:770] primary dev: ETH0
I0320 04:59:13.423259 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:59:13.423270 543705 net.go:698] Add success.
I0320 04:59:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 04:59:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 04:59:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 04:59:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 04:59:14.456562 543705 disk_worker.go:494] system disk:vda1
I0320 04:59:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 04:59:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 04:59:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:59:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:59:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 04:59:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0320 04:59:22.665677 543705 disk_info.go:125] begin check local disk info of client
I0320 04:59:22.668231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 04:59:22.668238 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003460c0 0xc000346100]
E0320 04:59:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:23.409792 543705 memory.go:184] no items to output this cycle
I0320 04:59:23.409804 543705 cpu.go:275] no items to output this cycle
E0320 04:59:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:33.409785 543705 memory.go:184] no items to output this cycle
I0320 04:59:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 04:59:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:43.409796 543705 memory.go:191] Add success.
I0320 04:59:43.409801 543705 cpu.go:282] Add success.
I0320 04:59:43.420034 543705 net.go:648] Add success.
I0320 04:59:43.422763 543705 net.go:770] primary dev: ETH0
I0320 04:59:43.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 04:59:43.422787 543705 net.go:698] Add success.
I0320 04:59:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 04:59:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 04:59:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 04:59:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 04:59:53.409794 543705 memory.go:184] no items to output this cycle
I0320 04:59:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:00:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:03.409780 543705 memory.go:184] no items to output this cycle
I0320 05:00:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 05:00:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:13.409811 543705 memory.go:191] Add success.
I0320 05:00:13.409811 543705 cpu.go:282] Add success.
W0320 05:00:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:00:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:00:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:00:13.420157 543705 net.go:648] Add success.
I0320 05:00:13.422555 543705 net.go:770] primary dev: ETH0
I0320 05:00:13.422573 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:00:13.422589 543705 net.go:698] Add success.
I0320 05:00:13.468911 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a9e2b6c7-9328-47df-a762-b7a17cf7738c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:00:13.468947 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:00:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:00:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:00:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 05:00:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:00:14.456629 543705 disk_worker.go:494] system disk:vda1
I0320 05:00:14.456660 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:00:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:00:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:00:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:00:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:00:22.668982 543705 disk_info.go:125] begin check local disk info of client
I0320 05:00:22.671434 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:00:22.671440 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc500 0xc0002bc540]
E0320 05:00:23.407541 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:23.407559 543705 memory.go:184] no items to output this cycle
I0320 05:00:23.407562 543705 cpu.go:275] no items to output this cycle
E0320 05:00:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:33.409783 543705 memory.go:184] no items to output this cycle
I0320 05:00:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 05:00:38.137376 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:00:38.137383 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:00:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:43.410661 543705 memory.go:191] Add success.
I0320 05:00:43.409815 543705 cpu.go:282] Add success.
I0320 05:00:43.420539 543705 net.go:648] Add success.
I0320 05:00:43.423305 543705 net.go:770] primary dev: ETH0
I0320 05:00:43.423324 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:00:43.423342 543705 net.go:698] Add success.
I0320 05:00:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:00:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:00:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:00:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:00:53.409781 543705 memory.go:184] no items to output this cycle
I0320 05:00:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 05:01:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:03.409796 543705 memory.go:184] no items to output this cycle
I0320 05:01:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 05:01:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:13.409804 543705 memory.go:191] Add success.
I0320 05:01:13.409826 543705 cpu.go:282] Add success.
W0320 05:01:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:01:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:01:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:01:13.420147 543705 net.go:648] Add success.
I0320 05:01:13.422757 543705 net.go:770] primary dev: ETH0
I0320 05:01:13.422771 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:01:13.422791 543705 net.go:698] Add success.
I0320 05:01:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:01:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:01:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 05:01:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:01:14.456616 543705 disk_worker.go:494] system disk:vda1
I0320 05:01:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:01:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:01:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:01:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:01:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:01:22.671997 543705 disk_info.go:125] begin check local disk info of client
I0320 05:01:22.674562 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:01:22.674569 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bdd40 0xc0002bdd80]
E0320 05:01:23.407511 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:23.407527 543705 memory.go:184] no items to output this cycle
I0320 05:01:23.407549 543705 cpu.go:275] no items to output this cycle
E0320 05:01:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:33.409779 543705 memory.go:184] no items to output this cycle
I0320 05:01:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:01:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:43.409776 543705 memory.go:191] Add success.
I0320 05:01:43.409811 543705 cpu.go:282] Add success.
I0320 05:01:43.419995 543705 net.go:648] Add success.
I0320 05:01:43.422863 543705 net.go:770] primary dev: ETH0
I0320 05:01:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:01:43.422892 543705 net.go:698] Add success.
I0320 05:01:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:01:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:01:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:01:53.409782 543705 memory.go:184] no items to output this cycle
I0320 05:01:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 05:02:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:03.409774 543705 memory.go:184] no items to output this cycle
I0320 05:02:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:02:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:13.409794 543705 memory.go:191] Add success.
I0320 05:02:13.409800 543705 cpu.go:282] Add success.
W0320 05:02:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:02:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:02:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:02:13.420166 543705 net.go:648] Add success.
I0320 05:02:13.423330 543705 net.go:770] primary dev: ETH0
I0320 05:02:13.423347 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:02:13.423360 543705 net.go:698] Add success.
W0320 05:02:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:02:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 05:02:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:02:14.456941 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:02:14.456950 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:02:14.456956 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:02:14.457011 543705 disk_worker.go:494] system disk:vda1
I0320 05:02:14.457040 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:02:15.456847 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:02:15.456869 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:02:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:02:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:02:16.457970 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:02:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:02:16.472315 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:02:22.675013 543705 disk_info.go:125] begin check local disk info of client
I0320 05:02:22.677403 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:02:22.677408 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed7c0 0xc0000ed800]
E0320 05:02:23.407527 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:23.407543 543705 memory.go:184] no items to output this cycle
I0320 05:02:23.407561 543705 cpu.go:275] no items to output this cycle
E0320 05:02:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:33.409802 543705 memory.go:184] no items to output this cycle
I0320 05:02:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 05:02:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:43.409790 543705 memory.go:191] Add success.
I0320 05:02:43.409791 543705 cpu.go:282] Add success.
I0320 05:02:43.420258 543705 net.go:648] Add success.
I0320 05:02:43.423445 543705 net.go:770] primary dev: ETH0
I0320 05:02:43.423460 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:02:43.423471 543705 net.go:698] Add success.
I0320 05:02:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:02:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:02:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:02:53.409786 543705 memory.go:184] no items to output this cycle
I0320 05:02:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:03:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:03.409792 543705 memory.go:184] no items to output this cycle
I0320 05:03:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 05:03:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:13.409801 543705 memory.go:191] Add success.
I0320 05:03:13.409802 543705 cpu.go:282] Add success.
W0320 05:03:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:03:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:03:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:03:13.420200 543705 net.go:648] Add success.
I0320 05:03:13.422786 543705 net.go:770] primary dev: ETH0
I0320 05:03:13.422798 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:03:13.422811 543705 net.go:698] Add success.
I0320 05:03:13.468791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5408792b-3673-419c-a4df-581289389999","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:03:13.468825 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:03:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:03:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:03:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 05:03:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:03:14.456616 543705 disk_worker.go:494] system disk:vda1
I0320 05:03:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:03:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:03:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:03:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:03:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:03:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:03:22.677677 543705 disk_info.go:125] begin check local disk info of client
I0320 05:03:22.680240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:03:22.680247 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377600 0xc000377640]
E0320 05:03:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:23.409781 543705 memory.go:184] no items to output this cycle
I0320 05:03:23.409798 543705 cpu.go:275] no items to output this cycle
E0320 05:03:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:33.409787 543705 memory.go:184] no items to output this cycle
I0320 05:03:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 05:03:38.137734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:03:38.137741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:03:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:43.410635 543705 memory.go:191] Add success.
I0320 05:03:43.409834 543705 cpu.go:282] Add success.
I0320 05:03:43.420551 543705 net.go:648] Add success.
I0320 05:03:43.423608 543705 net.go:770] primary dev: ETH0
I0320 05:03:43.423621 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:03:43.423633 543705 net.go:698] Add success.
I0320 05:03:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:03:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:03:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:03:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:03:53.409779 543705 memory.go:184] no items to output this cycle
I0320 05:03:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 05:04:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:03.409785 543705 memory.go:184] no items to output this cycle
I0320 05:04:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 05:04:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:13.409829 543705 memory.go:191] Add success.
I0320 05:04:13.409833 543705 cpu.go:282] Add success.
W0320 05:04:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:04:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:04:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:04:13.420275 543705 net.go:648] Add success.
I0320 05:04:13.423391 543705 net.go:770] primary dev: ETH0
I0320 05:04:13.423403 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:04:13.423415 543705 net.go:698] Add success.
I0320 05:04:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:04:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:04:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 05:04:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:04:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 05:04:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:04:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:04:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:04:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:04:16.472405 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:04:22.681045 543705 disk_info.go:125] begin check local disk info of client
I0320 05:04:22.683529 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:04:22.683536 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509b00 0xc000509b40]
E0320 05:04:23.407869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:23.407887 543705 memory.go:184] no items to output this cycle
I0320 05:04:23.407922 543705 cpu.go:275] no items to output this cycle
E0320 05:04:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:33.409770 543705 memory.go:184] no items to output this cycle
I0320 05:04:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:04:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:43.409783 543705 memory.go:191] Add success.
I0320 05:04:43.409802 543705 cpu.go:282] Add success.
I0320 05:04:43.419956 543705 net.go:648] Add success.
I0320 05:04:43.422849 543705 net.go:770] primary dev: ETH0
I0320 05:04:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:04:43.422874 543705 net.go:698] Add success.
I0320 05:04:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:04:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:04:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:04:53.410262 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:04:53.410278 543705 memory.go:184] no items to output this cycle
I0320 05:04:53.410277 543705 cpu.go:275] no items to output this cycle
E0320 05:05:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:03.409779 543705 memory.go:184] no items to output this cycle
I0320 05:05:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 05:05:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:13.409791 543705 memory.go:191] Add success.
I0320 05:05:13.409795 543705 cpu.go:282] Add success.
W0320 05:05:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:05:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:05:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:05:13.420094 543705 net.go:648] Add success.
I0320 05:05:13.422811 543705 net.go:770] primary dev: ETH0
I0320 05:05:13.422824 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:05:13.422837 543705 net.go:698] Add success.
I0320 05:05:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:05:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:05:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 05:05:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:05:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 05:05:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:05:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:05:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:05:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:05:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:05:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:05:22.683624 543705 disk_info.go:125] begin check local disk info of client
I0320 05:05:22.686209 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:05:22.686216 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462ac0 0xc000462b00]
E0320 05:05:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:23.409754 543705 memory.go:184] no items to output this cycle
I0320 05:05:23.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:05:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:33.409799 543705 memory.go:184] no items to output this cycle
I0320 05:05:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 05:05:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:43.409795 543705 memory.go:191] Add success.
I0320 05:05:43.409796 543705 cpu.go:282] Add success.
I0320 05:05:43.420119 543705 net.go:648] Add success.
I0320 05:05:43.423133 543705 net.go:770] primary dev: ETH0
I0320 05:05:43.423146 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:05:43.423158 543705 net.go:698] Add success.
I0320 05:05:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:05:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:05:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:05:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:05:53.409768 543705 memory.go:184] no items to output this cycle
I0320 05:05:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:06:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:03.409787 543705 memory.go:184] no items to output this cycle
I0320 05:06:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:06:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:13.409779 543705 memory.go:191] Add success.
W0320 05:06:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:06:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:06:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:06:13.409829 543705 cpu.go:282] Add success.
I0320 05:06:13.420036 543705 net.go:648] Add success.
I0320 05:06:13.422898 543705 net.go:770] primary dev: ETH0
I0320 05:06:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:06:13.422921 543705 net.go:698] Add success.
I0320 05:06:13.464759 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f728b936-55a0-4a5b-bcf4-f52e90aa56e4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:06:13.464790 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:06:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:06:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 05:06:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:06:14.456914 543705 disk_worker.go:494] system disk:vda1
I0320 05:06:14.456946 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:06:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:06:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:06:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:06:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:06:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:06:22.686301 543705 disk_info.go:125] begin check local disk info of client
I0320 05:06:22.688812 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:06:22.688818 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba300 0xc0002ba340]
E0320 05:06:23.409485 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:23.409501 543705 memory.go:184] no items to output this cycle
I0320 05:06:23.409561 543705 cpu.go:275] no items to output this cycle
E0320 05:06:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:33.409799 543705 memory.go:184] no items to output this cycle
I0320 05:06:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 05:06:38.141395 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:06:38.141402 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:06:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:43.410613 543705 memory.go:191] Add success.
I0320 05:06:43.409797 543705 cpu.go:282] Add success.
I0320 05:06:43.420195 543705 net.go:770] primary dev: ETH0
I0320 05:06:43.420208 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:06:43.420223 543705 net.go:698] Add success.
I0320 05:06:43.420575 543705 net.go:648] Add success.
I0320 05:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:06:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:06:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:06:53.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:06:53.409882 543705 memory.go:184] no items to output this cycle
I0320 05:06:53.409910 543705 cpu.go:275] no items to output this cycle
E0320 05:07:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:03.409786 543705 memory.go:184] no items to output this cycle
I0320 05:07:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:07:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:13.409816 543705 memory.go:191] Add success.
I0320 05:07:13.409819 543705 cpu.go:282] Add success.
W0320 05:07:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:07:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:07:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:07:13.420162 543705 net.go:648] Add success.
I0320 05:07:13.423052 543705 net.go:770] primary dev: ETH0
I0320 05:07:13.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:07:13.423078 543705 net.go:698] Add success.
I0320 05:07:13.453667 543705 event_worker.go:152] Polling the log file for events...
W0320 05:07:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:07:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 05:07:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:07:14.455905 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:07:14.455913 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:07:14.455920 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:07:14.456547 543705 disk_worker.go:494] system disk:vda1
I0320 05:07:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:07:15.456852 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:07:15.456862 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:07:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:07:16.457973 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:07:16.458019 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:07:16.458035 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:07:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:07:22.689675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:07:22.692203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:07:22.692210 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4e00 0xc0000c4e40]
E0320 05:07:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:23.409796 543705 memory.go:184] no items to output this cycle
I0320 05:07:23.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:07:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:33.409779 543705 cpu.go:275] no items to output this cycle
I0320 05:07:33.409780 543705 memory.go:184] no items to output this cycle
E0320 05:07:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:43.409816 543705 memory.go:191] Add success.
I0320 05:07:43.409832 543705 cpu.go:282] Add success.
I0320 05:07:43.419973 543705 net.go:648] Add success.
I0320 05:07:43.422568 543705 net.go:770] primary dev: ETH0
I0320 05:07:43.422581 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:07:43.422592 543705 net.go:698] Add success.
I0320 05:07:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:07:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:07:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:07:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:07:53.409774 543705 memory.go:184] no items to output this cycle
I0320 05:07:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 05:08:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:03.409795 543705 memory.go:184] no items to output this cycle
I0320 05:08:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:08:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:13.409795 543705 memory.go:191] Add success.
I0320 05:08:13.409795 543705 cpu.go:282] Add success.
W0320 05:08:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:08:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:08:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:08:13.420136 543705 net.go:648] Add success.
I0320 05:08:13.422991 543705 net.go:770] primary dev: ETH0
I0320 05:08:13.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:08:13.423019 543705 net.go:698] Add success.
I0320 05:08:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:08:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:08:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 05:08:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:08:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 05:08:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:08:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:08:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:08:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:08:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:08:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:08:22.693100 543705 disk_info.go:125] begin check local disk info of client
I0320 05:08:22.695566 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:08:22.695572 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002baac0 0xc0002bab00]
E0320 05:08:23.407869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:23.407882 543705 memory.go:184] no items to output this cycle
I0320 05:08:23.407939 543705 cpu.go:275] no items to output this cycle
E0320 05:08:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:33.409811 543705 memory.go:184] no items to output this cycle
I0320 05:08:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 05:08:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:43.409825 543705 memory.go:191] Add success.
I0320 05:08:43.409830 543705 cpu.go:282] Add success.
I0320 05:08:43.420060 543705 net.go:648] Add success.
I0320 05:08:43.422749 543705 net.go:770] primary dev: ETH0
I0320 05:08:43.422764 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:08:43.422776 543705 net.go:698] Add success.
I0320 05:08:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:08:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:08:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:08:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:08:53.409785 543705 memory.go:184] no items to output this cycle
I0320 05:08:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 05:09:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:03.409794 543705 memory.go:184] no items to output this cycle
I0320 05:09:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:09:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:13.409806 543705 memory.go:191] Add success.
I0320 05:09:13.409811 543705 cpu.go:282] Add success.
W0320 05:09:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:09:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:09:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:09:13.420135 543705 net.go:648] Add success.
I0320 05:09:13.422632 543705 net.go:770] primary dev: ETH0
I0320 05:09:13.422644 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:09:13.422656 543705 net.go:698] Add success.
I0320 05:09:13.469193 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"74d01eab-e1fe-4562-b910-5dbb3efc0e52","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:09:13.469226 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:09:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:09:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:09:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 05:09:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:09:14.456663 543705 disk_worker.go:494] system disk:vda1
I0320 05:09:14.456694 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:09:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:09:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:09:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:09:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:09:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:09:22.696128 543705 disk_info.go:125] begin check local disk info of client
I0320 05:09:22.698711 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:09:22.698718 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed200 0xc0000ed240]
E0320 05:09:23.409354 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:23.409370 543705 memory.go:184] no items to output this cycle
I0320 05:09:23.409388 543705 cpu.go:275] no items to output this cycle
E0320 05:09:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:33.409810 543705 memory.go:184] no items to output this cycle
I0320 05:09:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 05:09:38.141738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:09:38.141755 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:09:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:43.410560 543705 memory.go:191] Add success.
I0320 05:09:43.409815 543705 cpu.go:282] Add success.
I0320 05:09:43.420254 543705 net.go:648] Add success.
I0320 05:09:43.422843 543705 net.go:770] primary dev: ETH0
I0320 05:09:43.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:09:43.422873 543705 net.go:698] Add success.
I0320 05:09:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:09:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:09:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:09:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:09:53.409775 543705 memory.go:184] no items to output this cycle
I0320 05:09:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 05:10:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:03.409807 543705 cpu.go:275] no items to output this cycle
I0320 05:10:03.409811 543705 memory.go:184] no items to output this cycle
E0320 05:10:13.410041 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:13.410075 543705 memory.go:191] Add success.
I0320 05:10:13.410104 543705 cpu.go:282] Add success.
W0320 05:10:13.410141 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:10:13.410200 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:10:13.410203 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:10:13.419707 543705 net.go:648] Add success.
I0320 05:10:13.422114 543705 net.go:770] primary dev: ETH0
I0320 05:10:13.422126 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:10:13.422138 543705 net.go:698] Add success.
I0320 05:10:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:10:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:10:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 05:10:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:10:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 05:10:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:10:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:10:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:10:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:10:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:10:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:10:22.700135 543705 disk_info.go:125] begin check local disk info of client
I0320 05:10:22.702611 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:10:22.702617 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048f040 0xc00048f080]
E0320 05:10:23.407889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:23.407906 543705 memory.go:184] no items to output this cycle
I0320 05:10:23.407920 543705 cpu.go:275] no items to output this cycle
E0320 05:10:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:33.409780 543705 memory.go:184] no items to output this cycle
I0320 05:10:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 05:10:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:43.409811 543705 memory.go:191] Add success.
I0320 05:10:43.409818 543705 cpu.go:282] Add success.
I0320 05:10:43.419907 543705 net.go:648] Add success.
I0320 05:10:43.422733 543705 net.go:770] primary dev: ETH0
I0320 05:10:43.422746 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:10:43.422760 543705 net.go:698] Add success.
I0320 05:10:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:10:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:10:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:10:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:10:53.409806 543705 cpu.go:275] no items to output this cycle
I0320 05:10:53.409808 543705 memory.go:184] no items to output this cycle
E0320 05:11:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:03.409810 543705 memory.go:184] no items to output this cycle
I0320 05:11:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 05:11:13.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:13.409907 543705 memory.go:191] Add success.
W0320 05:11:13.409936 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:11:13.409948 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:11:13.409955 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:11:13.409971 543705 cpu.go:282] Add success.
I0320 05:11:13.419728 543705 net.go:648] Add success.
I0320 05:11:13.422458 543705 net.go:770] primary dev: ETH0
I0320 05:11:13.422472 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:11:13.422486 543705 net.go:698] Add success.
I0320 05:11:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:11:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:11:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 05:11:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:11:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 05:11:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:11:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:11:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:11:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:11:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:11:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:11:22.703159 543705 disk_info.go:125] begin check local disk info of client
I0320 05:11:22.705686 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:11:22.705693 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004815c0 0xc000481600]
E0320 05:11:23.409289 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:23.409308 543705 memory.go:184] no items to output this cycle
I0320 05:11:23.409323 543705 cpu.go:275] no items to output this cycle
E0320 05:11:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:33.409778 543705 memory.go:184] no items to output this cycle
I0320 05:11:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 05:11:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:43.409816 543705 memory.go:191] Add success.
I0320 05:11:43.409824 543705 cpu.go:282] Add success.
I0320 05:11:43.420001 543705 net.go:648] Add success.
I0320 05:11:43.422738 543705 net.go:770] primary dev: ETH0
I0320 05:11:43.422752 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:11:43.422765 543705 net.go:698] Add success.
I0320 05:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:11:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:11:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:11:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:11:53.409799 543705 memory.go:184] no items to output this cycle
I0320 05:11:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 05:12:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:03.409812 543705 memory.go:184] no items to output this cycle
I0320 05:12:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 05:12:13.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:13.409868 543705 memory.go:191] Add success.
W0320 05:12:13.409897 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:12:13.409916 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:12:13.409920 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:12:13.409990 543705 cpu.go:282] Add success.
I0320 05:12:13.419712 543705 net.go:648] Add success.
I0320 05:12:13.422291 543705 net.go:770] primary dev: ETH0
I0320 05:12:13.422306 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:12:13.422319 543705 net.go:698] Add success.
I0320 05:12:13.468636 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"efc93293-62cb-4099-9d6c-68c04ff04bb5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:12:13.468669 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 05:12:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:12:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 05:12:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:12:14.456808 543705 disk_worker.go:494] system disk:vda1
E0320 05:12:14.456823 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:12:14.456841 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:12:14.456845 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:12:14.456847 543705 custom_config.go:64] query custom config with name: gpu
E0320 05:12:15.456815 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:12:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:12:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:12:16.457995 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:12:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:12:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:12:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:12:22.707175 543705 disk_info.go:125] begin check local disk info of client
I0320 05:12:22.709660 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:12:22.709669 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab840 0xc0001ab880]
E0320 05:12:23.407849 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:23.407862 543705 memory.go:184] no items to output this cycle
I0320 05:12:23.407896 543705 cpu.go:275] no items to output this cycle
E0320 05:12:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:33.409776 543705 memory.go:184] no items to output this cycle
I0320 05:12:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 05:12:38.145418 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:12:38.145425 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:12:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:43.410645 543705 memory.go:191] Add success.
I0320 05:12:43.409807 543705 cpu.go:282] Add success.
I0320 05:12:43.420342 543705 net.go:648] Add success.
I0320 05:12:43.422958 543705 net.go:770] primary dev: ETH0
I0320 05:12:43.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:12:43.422998 543705 net.go:698] Add success.
I0320 05:12:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:12:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:12:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:12:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:12:53.409800 543705 memory.go:184] no items to output this cycle
I0320 05:12:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 05:13:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:03.409887 543705 cpu.go:275] no items to output this cycle
I0320 05:13:03.409905 543705 memory.go:184] no items to output this cycle
E0320 05:13:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:13.409796 543705 memory.go:191] Add success.
I0320 05:13:13.409798 543705 cpu.go:282] Add success.
W0320 05:13:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:13:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:13:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:13:13.420129 543705 net.go:648] Add success.
I0320 05:13:13.422652 543705 net.go:770] primary dev: ETH0
I0320 05:13:13.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:13:13.422677 543705 net.go:698] Add success.
I0320 05:13:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:13:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:13:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 05:13:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:13:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 05:13:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:13:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:13:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:13:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:13:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:13:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:13:22.711201 543705 disk_info.go:125] begin check local disk info of client
I0320 05:13:22.713806 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:13:22.713813 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377580 0xc0003775c0]
E0320 05:13:23.409353 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:23.409369 543705 memory.go:184] no items to output this cycle
I0320 05:13:23.409370 543705 cpu.go:275] no items to output this cycle
E0320 05:13:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:33.409810 543705 memory.go:184] no items to output this cycle
I0320 05:13:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 05:13:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:43.409789 543705 memory.go:191] Add success.
I0320 05:13:43.409808 543705 cpu.go:282] Add success.
I0320 05:13:43.420090 543705 net.go:648] Add success.
I0320 05:13:43.422967 543705 net.go:770] primary dev: ETH0
I0320 05:13:43.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:13:43.422996 543705 net.go:698] Add success.
I0320 05:13:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:13:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:13:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:13:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:13:53.409799 543705 memory.go:184] no items to output this cycle
I0320 05:13:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:14:03.409902 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:03.410016 543705 memory.go:184] no items to output this cycle
I0320 05:14:03.409936 543705 cpu.go:275] no items to output this cycle
E0320 05:14:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:13.409784 543705 memory.go:191] Add success.
W0320 05:14:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:14:13.409819 543705 cpu.go:282] Add success.
W0320 05:14:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:14:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:14:13.420331 543705 net.go:648] Add success.
I0320 05:14:13.422802 543705 net.go:770] primary dev: ETH0
I0320 05:14:13.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:14:13.422830 543705 net.go:698] Add success.
I0320 05:14:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:14:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:14:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 05:14:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:14:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 05:14:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:14:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:14:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:14:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:14:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:14:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:14:22.715206 543705 disk_info.go:125] begin check local disk info of client
I0320 05:14:22.717726 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:14:22.717732 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377c40 0xc000377c80]
E0320 05:14:23.409239 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:23.409255 543705 memory.go:184] no items to output this cycle
I0320 05:14:23.409286 543705 cpu.go:275] no items to output this cycle
E0320 05:14:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:33.409803 543705 memory.go:184] no items to output this cycle
I0320 05:14:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 05:14:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:43.409782 543705 memory.go:191] Add success.
I0320 05:14:43.409812 543705 cpu.go:282] Add success.
I0320 05:14:43.419697 543705 net.go:770] primary dev: ETH0
I0320 05:14:43.419712 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:14:43.419728 543705 net.go:698] Add success.
I0320 05:14:43.420089 543705 net.go:648] Add success.
I0320 05:14:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:14:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:14:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:14:53.409828 543705 cpu.go:275] no items to output this cycle
E0320 05:14:53.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:14:53.409903 543705 memory.go:184] no items to output this cycle
E0320 05:15:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:03.409783 543705 memory.go:184] no items to output this cycle
I0320 05:15:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 05:15:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:13.409795 543705 memory.go:191] Add success.
I0320 05:15:13.409795 543705 cpu.go:282] Add success.
W0320 05:15:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:15:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:15:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:15:13.420127 543705 net.go:648] Add success.
I0320 05:15:13.422713 543705 net.go:770] primary dev: ETH0
I0320 05:15:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:15:13.422738 543705 net.go:698] Add success.
I0320 05:15:13.468139 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"64f3f2f6-7b8d-4c47-a551-f433dcafc3e4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:15:13.468182 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:15:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:15:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:15:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 05:15:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:15:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 05:15:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:15:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:15:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:15:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:15:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:15:16.472484 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:15:22.719227 543705 disk_info.go:125] begin check local disk info of client
I0320 05:15:22.721816 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:15:22.721823 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a91c0 0xc0004a9200]
E0320 05:15:23.409314 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:23.409328 543705 memory.go:184] no items to output this cycle
I0320 05:15:23.409363 543705 cpu.go:275] no items to output this cycle
E0320 05:15:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 05:15:33.409798 543705 memory.go:184] no items to output this cycle
I0320 05:15:38.145732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:15:38.145739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:15:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:43.410594 543705 memory.go:191] Add success.
I0320 05:15:43.409827 543705 cpu.go:282] Add success.
I0320 05:15:43.420366 543705 net.go:648] Add success.
I0320 05:15:43.422924 543705 net.go:770] primary dev: ETH0
I0320 05:15:43.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:15:43.422954 543705 net.go:698] Add success.
I0320 05:15:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:15:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:15:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:15:53.409837 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:15:53.409856 543705 memory.go:184] no items to output this cycle
I0320 05:15:53.409919 543705 cpu.go:275] no items to output this cycle
E0320 05:16:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:03.409776 543705 memory.go:184] no items to output this cycle
I0320 05:16:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 05:16:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:13.409787 543705 memory.go:191] Add success.
I0320 05:16:13.409810 543705 cpu.go:282] Add success.
W0320 05:16:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:16:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:16:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:16:13.420169 543705 net.go:648] Add success.
I0320 05:16:13.422920 543705 net.go:770] primary dev: ETH0
I0320 05:16:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:16:13.422948 543705 net.go:698] Add success.
I0320 05:16:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:16:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:16:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 05:16:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:16:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 05:16:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:16:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:16:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:16:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:16:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:16:16.472482 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:16:22.723264 543705 disk_info.go:125] begin check local disk info of client
I0320 05:16:22.725828 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:16:22.725833 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377e80 0xc000377ec0]
E0320 05:16:23.407518 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:23.407532 543705 memory.go:184] no items to output this cycle
I0320 05:16:23.407551 543705 cpu.go:275] no items to output this cycle
E0320 05:16:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:33.409772 543705 memory.go:184] no items to output this cycle
I0320 05:16:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 05:16:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:43.409812 543705 memory.go:191] Add success.
I0320 05:16:43.409824 543705 cpu.go:282] Add success.
I0320 05:16:43.419962 543705 net.go:648] Add success.
I0320 05:16:43.422926 543705 net.go:770] primary dev: ETH0
I0320 05:16:43.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:16:43.423039 543705 net.go:698] Add success.
I0320 05:16:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:16:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:16:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:16:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:16:53.409771 543705 memory.go:184] no items to output this cycle
I0320 05:16:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 05:17:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:03.409803 543705 memory.go:184] no items to output this cycle
I0320 05:17:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 05:17:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:13.409780 543705 memory.go:191] Add success.
I0320 05:17:13.409798 543705 cpu.go:282] Add success.
W0320 05:17:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:17:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:17:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:17:13.420065 543705 net.go:648] Add success.
I0320 05:17:13.422661 543705 net.go:770] primary dev: ETH0
I0320 05:17:13.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:17:13.422686 543705 net.go:698] Add success.
I0320 05:17:13.453222 543705 event_worker.go:152] Polling the log file for events...
W0320 05:17:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:17:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 05:17:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:17:14.456898 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:17:14.456908 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:17:14.456914 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:17:14.456965 543705 disk_worker.go:494] system disk:vda1
I0320 05:17:14.457008 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:17:15.456846 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:17:15.456855 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:17:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:17:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:17:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:17:16.457999 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:17:16.472314 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:17:22.725922 543705 disk_info.go:125] begin check local disk info of client
I0320 05:17:22.728485 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:17:22.728493 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a040 0xc00039a080]
E0320 05:17:23.407882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:23.407902 543705 memory.go:184] no items to output this cycle
I0320 05:17:23.407916 543705 cpu.go:275] no items to output this cycle
E0320 05:17:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:33.409769 543705 memory.go:184] no items to output this cycle
I0320 05:17:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 05:17:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:43.409782 543705 memory.go:191] Add success.
I0320 05:17:43.409805 543705 cpu.go:282] Add success.
I0320 05:17:43.419998 543705 net.go:648] Add success.
I0320 05:17:43.422776 543705 net.go:770] primary dev: ETH0
I0320 05:17:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:17:43.422805 543705 net.go:698] Add success.
I0320 05:17:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:17:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:17:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:17:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:17:53.409771 543705 memory.go:184] no items to output this cycle
I0320 05:17:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 05:18:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:03.409808 543705 memory.go:184] no items to output this cycle
I0320 05:18:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 05:18:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:13.409812 543705 memory.go:191] Add success.
I0320 05:18:13.409820 543705 cpu.go:282] Add success.
W0320 05:18:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:18:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:18:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:18:13.420140 543705 net.go:648] Add success.
I0320 05:18:13.422762 543705 net.go:770] primary dev: ETH0
I0320 05:18:13.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:18:13.422787 543705 net.go:698] Add success.
I0320 05:18:13.467952 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"139feac5-5c5c-477a-91c7-858365d3f723","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:18:13.467986 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:18:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:18:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:18:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 05:18:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:18:14.456696 543705 disk_worker.go:494] system disk:vda1
I0320 05:18:14.456728 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:18:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:18:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:18:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:18:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:18:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:18:22.728575 543705 disk_info.go:125] begin check local disk info of client
I0320 05:18:22.731104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:18:22.731111 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376480 0xc0003764c0]
E0320 05:18:23.409573 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:23.409574 543705 cpu.go:275] no items to output this cycle
I0320 05:18:23.409585 543705 memory.go:184] no items to output this cycle
E0320 05:18:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:33.409808 543705 memory.go:184] no items to output this cycle
I0320 05:18:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 05:18:38.145883 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:18:38.145890 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:18:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:43.410771 543705 memory.go:191] Add success.
I0320 05:18:43.409906 543705 cpu.go:282] Add success.
I0320 05:18:43.419746 543705 net.go:648] Add success.
I0320 05:18:43.422291 543705 net.go:770] primary dev: ETH0
I0320 05:18:43.422304 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:18:43.422316 543705 net.go:698] Add success.
I0320 05:18:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:18:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:18:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:18:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:18:53.409768 543705 memory.go:184] no items to output this cycle
I0320 05:18:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:19:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:03.409781 543705 memory.go:184] no items to output this cycle
I0320 05:19:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:19:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:13.409808 543705 memory.go:191] Add success.
I0320 05:19:13.409817 543705 cpu.go:282] Add success.
W0320 05:19:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:19:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:19:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:19:13.420147 543705 net.go:648] Add success.
I0320 05:19:13.423053 543705 net.go:770] primary dev: ETH0
I0320 05:19:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:19:13.423078 543705 net.go:698] Add success.
I0320 05:19:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:19:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:19:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 05:19:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:19:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 05:19:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:19:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:19:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:19:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:19:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:19:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:19:22.732300 543705 disk_info.go:125] begin check local disk info of client
I0320 05:19:22.734903 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:19:22.734911 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376280 0xc0003762c0]
E0320 05:19:23.409327 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:23.409340 543705 memory.go:184] no items to output this cycle
I0320 05:19:23.409379 543705 cpu.go:275] no items to output this cycle
E0320 05:19:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:33.409770 543705 memory.go:184] no items to output this cycle
I0320 05:19:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:19:43.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:43.409909 543705 memory.go:191] Add success.
I0320 05:19:43.409974 543705 cpu.go:282] Add success.
I0320 05:19:43.419725 543705 net.go:648] Add success.
I0320 05:19:43.422654 543705 net.go:770] primary dev: ETH0
I0320 05:19:43.422667 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:19:43.422678 543705 net.go:698] Add success.
I0320 05:19:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:19:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:19:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:19:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:19:53.409783 543705 memory.go:184] no items to output this cycle
I0320 05:19:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 05:20:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:03.409777 543705 memory.go:184] no items to output this cycle
I0320 05:20:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:20:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:13.409779 543705 memory.go:191] Add success.
W0320 05:20:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:20:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:20:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:20:13.409829 543705 cpu.go:282] Add success.
I0320 05:20:13.420078 543705 net.go:648] Add success.
I0320 05:20:13.423011 543705 net.go:770] primary dev: ETH0
I0320 05:20:13.423029 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:20:13.423043 543705 net.go:698] Add success.
I0320 05:20:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:20:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:20:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 05:20:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:20:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 05:20:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:20:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:20:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:20:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:20:22.736320 543705 disk_info.go:125] begin check local disk info of client
I0320 05:20:22.738860 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:20:22.738867 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9240 0xc0004a9280]
E0320 05:20:23.409281 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:23.409296 543705 memory.go:184] no items to output this cycle
I0320 05:20:23.409306 543705 cpu.go:275] no items to output this cycle
I0320 05:20:33.409923 543705 cpu.go:275] no items to output this cycle
E0320 05:20:33.409927 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:33.409946 543705 memory.go:184] no items to output this cycle
E0320 05:20:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:43.409819 543705 memory.go:191] Add success.
I0320 05:20:43.409828 543705 cpu.go:282] Add success.
I0320 05:20:43.419962 543705 net.go:648] Add success.
I0320 05:20:43.422853 543705 net.go:770] primary dev: ETH0
I0320 05:20:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:20:43.422882 543705 net.go:698] Add success.
I0320 05:20:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:20:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:20:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:20:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:20:53.409779 543705 memory.go:184] no items to output this cycle
I0320 05:20:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 05:21:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:03.409791 543705 memory.go:184] no items to output this cycle
I0320 05:21:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:21:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:13.409783 543705 memory.go:191] Add success.
I0320 05:21:13.409783 543705 cpu.go:282] Add success.
W0320 05:21:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:21:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:21:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:21:13.420057 543705 net.go:648] Add success.
I0320 05:21:13.422995 543705 net.go:770] primary dev: ETH0
I0320 05:21:13.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:21:13.423023 543705 net.go:698] Add success.
I0320 05:21:13.575413 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e8733d17-a07f-4414-9b5b-13ba6f0b5dc5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:21:13.575449 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:21:14.453972 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:21:14.455244 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:21:14.455254 543705 disk_worker.go:708] disk space is not compliant
W0320 05:21:14.455257 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:21:14.456624 543705 disk_worker.go:494] system disk:vda1
I0320 05:21:14.456679 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:21:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:21:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:21:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:21:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:21:16.472424 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:21:22.740336 543705 disk_info.go:125] begin check local disk info of client
I0320 05:21:22.742876 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:21:22.742882 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abb00 0xc0001abb40]
E0320 05:21:23.409377 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:23.409392 543705 memory.go:184] no items to output this cycle
I0320 05:21:23.409488 543705 cpu.go:275] no items to output this cycle
E0320 05:21:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:33.409778 543705 memory.go:184] no items to output this cycle
I0320 05:21:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 05:21:38.149435 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:21:38.149441 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:21:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:43.410735 543705 memory.go:191] Add success.
I0320 05:21:43.409805 543705 cpu.go:282] Add success.
I0320 05:21:43.420478 543705 net.go:648] Add success.
I0320 05:21:43.423375 543705 net.go:770] primary dev: ETH0
I0320 05:21:43.423388 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:21:43.423400 543705 net.go:698] Add success.
I0320 05:21:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:21:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:21:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:21:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:21:53.409780 543705 memory.go:184] no items to output this cycle
I0320 05:21:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 05:22:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:03.409806 543705 memory.go:184] no items to output this cycle
I0320 05:22:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 05:22:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:13.409784 543705 memory.go:191] Add success.
I0320 05:22:13.409803 543705 cpu.go:282] Add success.
W0320 05:22:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:22:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:22:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:22:13.420094 543705 net.go:648] Add success.
I0320 05:22:13.423030 543705 net.go:770] primary dev: ETH0
I0320 05:22:13.423043 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:22:13.423056 543705 net.go:698] Add success.
W0320 05:22:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:22:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 05:22:14.455201 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:22:14.455909 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:22:14.455918 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:22:14.455924 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:22:14.456571 543705 disk_worker.go:494] system disk:vda1
I0320 05:22:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:22:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:22:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:22:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:22:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:22:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:22:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:22:16.472345 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:22:22.744352 543705 disk_info.go:125] begin check local disk info of client
I0320 05:22:22.746805 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:22:22.746811 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046de00 0xc00046de40]
E0320 05:22:23.407630 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:23.407645 543705 memory.go:184] no items to output this cycle
I0320 05:22:23.407684 543705 cpu.go:275] no items to output this cycle
E0320 05:22:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:33.409808 543705 memory.go:184] no items to output this cycle
I0320 05:22:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 05:22:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:43.409818 543705 memory.go:191] Add success.
I0320 05:22:43.409826 543705 cpu.go:282] Add success.
I0320 05:22:43.420049 543705 net.go:648] Add success.
I0320 05:22:43.422787 543705 net.go:770] primary dev: ETH0
I0320 05:22:43.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:22:43.422813 543705 net.go:698] Add success.
I0320 05:22:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:22:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:22:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:22:53.410275 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:22:53.410292 543705 memory.go:184] no items to output this cycle
I0320 05:22:53.410299 543705 cpu.go:275] no items to output this cycle
E0320 05:23:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:03.409790 543705 memory.go:184] no items to output this cycle
I0320 05:23:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 05:23:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:13.409799 543705 cpu.go:282] Add success.
I0320 05:23:13.409820 543705 memory.go:191] Add success.
W0320 05:23:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:23:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:23:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:23:13.420183 543705 net.go:648] Add success.
I0320 05:23:13.421060 543705 net.go:770] primary dev: ETH0
I0320 05:23:13.421073 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:23:13.421084 543705 net.go:698] Add success.
I0320 05:23:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:23:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:23:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 05:23:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:23:14.456626 543705 disk_worker.go:494] system disk:vda1
I0320 05:23:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:23:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:23:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:23:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:23:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:23:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:23:22.746901 543705 disk_info.go:125] begin check local disk info of client
I0320 05:23:22.749447 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:23:22.749453 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002718c0 0xc000271900]
E0320 05:23:23.407534 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:23.407551 543705 memory.go:184] no items to output this cycle
I0320 05:23:23.407565 543705 cpu.go:275] no items to output this cycle
E0320 05:23:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:33.409792 543705 memory.go:184] no items to output this cycle
I0320 05:23:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:23:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:43.409788 543705 memory.go:191] Add success.
I0320 05:23:43.409810 543705 cpu.go:282] Add success.
I0320 05:23:43.419956 543705 net.go:648] Add success.
I0320 05:23:43.422690 543705 net.go:770] primary dev: ETH0
I0320 05:23:43.422703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:23:43.422715 543705 net.go:698] Add success.
I0320 05:23:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:23:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:23:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:23:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:23:53.409776 543705 memory.go:184] no items to output this cycle
I0320 05:23:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:24:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:03.409791 543705 cpu.go:275] no items to output this cycle
I0320 05:24:03.409802 543705 memory.go:184] no items to output this cycle
E0320 05:24:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:13.409814 543705 memory.go:191] Add success.
I0320 05:24:13.409827 543705 cpu.go:282] Add success.
W0320 05:24:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:24:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:24:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:24:13.420131 543705 net.go:648] Add success.
I0320 05:24:13.422527 543705 net.go:770] primary dev: ETH0
I0320 05:24:13.422540 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:24:13.422551 543705 net.go:698] Add success.
I0320 05:24:13.468308 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"582e5053-326d-4947-a85b-c4fc8f84c725","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:24:13.468343 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:24:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:24:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:24:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 05:24:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:24:14.456685 543705 disk_worker.go:494] system disk:vda1
I0320 05:24:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:24:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:24:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:24:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:24:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:24:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:24:22.749674 543705 disk_info.go:125] begin check local disk info of client
I0320 05:24:22.752129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:24:22.752135 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003850c0 0xc000385100]
E0320 05:24:23.409497 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:23.409514 543705 memory.go:184] no items to output this cycle
I0320 05:24:23.409529 543705 cpu.go:275] no items to output this cycle
E0320 05:24:33.409860 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:33.409876 543705 memory.go:184] no items to output this cycle
I0320 05:24:33.409877 543705 cpu.go:275] no items to output this cycle
I0320 05:24:38.149735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:24:38.149742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:24:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:43.410875 543705 memory.go:191] Add success.
I0320 05:24:43.409823 543705 cpu.go:282] Add success.
I0320 05:24:43.419863 543705 net.go:648] Add success.
I0320 05:24:43.422236 543705 net.go:770] primary dev: ETH0
I0320 05:24:43.422248 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:24:43.422261 543705 net.go:698] Add success.
I0320 05:24:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:24:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:24:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:24:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:24:53.409781 543705 memory.go:184] no items to output this cycle
I0320 05:24:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 05:25:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:03.409785 543705 memory.go:184] no items to output this cycle
I0320 05:25:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 05:25:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:13.409814 543705 memory.go:191] Add success.
I0320 05:25:13.409830 543705 cpu.go:282] Add success.
W0320 05:25:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:25:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:25:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:25:13.420110 543705 net.go:648] Add success.
I0320 05:25:13.422594 543705 net.go:770] primary dev: ETH0
I0320 05:25:13.422608 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:25:13.422621 543705 net.go:698] Add success.
I0320 05:25:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:25:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:25:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 05:25:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:25:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 05:25:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:25:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:25:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:25:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:25:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:25:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:25:22.753422 543705 disk_info.go:125] begin check local disk info of client
I0320 05:25:22.756040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:25:22.756047 543705 disk_info.go:196] parse disk info done, disk is : [0xc00054a700 0xc00054a740]
E0320 05:25:23.409391 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:23.409406 543705 memory.go:184] no items to output this cycle
I0320 05:25:23.409443 543705 cpu.go:275] no items to output this cycle
E0320 05:25:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:33.409788 543705 memory.go:184] no items to output this cycle
I0320 05:25:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 05:25:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:43.409825 543705 memory.go:191] Add success.
I0320 05:25:43.409829 543705 cpu.go:282] Add success.
I0320 05:25:43.419963 543705 net.go:648] Add success.
I0320 05:25:43.422515 543705 net.go:770] primary dev: ETH0
I0320 05:25:43.422527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:25:43.422539 543705 net.go:698] Add success.
I0320 05:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:25:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:25:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:25:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:25:53.409777 543705 memory.go:184] no items to output this cycle
I0320 05:25:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 05:26:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:03.409790 543705 memory.go:184] no items to output this cycle
I0320 05:26:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:26:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:13.409786 543705 memory.go:191] Add success.
I0320 05:26:13.409788 543705 cpu.go:282] Add success.
W0320 05:26:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:26:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:26:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:26:13.420091 543705 net.go:648] Add success.
I0320 05:26:13.422585 543705 net.go:770] primary dev: ETH0
I0320 05:26:13.422599 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:26:13.422613 543705 net.go:698] Add success.
I0320 05:26:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:26:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:26:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 05:26:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:26:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 05:26:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:26:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:26:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:26:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:26:22.756131 543705 disk_info.go:125] begin check local disk info of client
I0320 05:26:22.758612 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:26:22.758618 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a6fc0 0xc0002a7000]
E0320 05:26:23.407881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:23.407899 543705 memory.go:184] no items to output this cycle
I0320 05:26:23.407916 543705 cpu.go:275] no items to output this cycle
E0320 05:26:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:33.409780 543705 memory.go:184] no items to output this cycle
I0320 05:26:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:26:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:43.409826 543705 memory.go:191] Add success.
I0320 05:26:43.409833 543705 cpu.go:282] Add success.
I0320 05:26:43.420156 543705 net.go:648] Add success.
I0320 05:26:43.422628 543705 net.go:770] primary dev: ETH0
I0320 05:26:43.422642 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:26:43.422654 543705 net.go:698] Add success.
I0320 05:26:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:26:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:26:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:26:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:26:53.409774 543705 memory.go:184] no items to output this cycle
I0320 05:26:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 05:27:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:03.409778 543705 memory.go:184] no items to output this cycle
I0320 05:27:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 05:27:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:13.409789 543705 memory.go:191] Add success.
I0320 05:27:13.409807 543705 cpu.go:282] Add success.
W0320 05:27:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:27:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:27:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:27:13.420040 543705 net.go:648] Add success.
I0320 05:27:13.422584 543705 net.go:770] primary dev: ETH0
I0320 05:27:13.422597 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:27:13.422608 543705 net.go:698] Add success.
I0320 05:27:13.428608 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 05:27:13.452772 543705 event_worker.go:152] Polling the log file for events...
I0320 05:27:13.464047 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f48ecf93-e6d0-458f-94b9-a8acd848692a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:27:13.464080 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 05:27:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:27:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 05:27:14.455190 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:27:14.455887 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:27:14.455897 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:27:14.455902 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:27:14.456668 543705 disk_worker.go:494] system disk:vda1
I0320 05:27:14.456704 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:27:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:27:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 05:27:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:27:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:27:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:27:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:27:16.472325 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:27:22.759428 543705 disk_info.go:125] begin check local disk info of client
I0320 05:27:22.762061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:27:22.762068 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002712c0 0xc000271300]
E0320 05:27:23.407515 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:23.407529 543705 memory.go:184] no items to output this cycle
I0320 05:27:23.407550 543705 cpu.go:275] no items to output this cycle
E0320 05:27:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:33.409797 543705 memory.go:184] no items to output this cycle
I0320 05:27:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 05:27:38.153478 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:27:38.153484 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:27:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:43.410608 543705 memory.go:191] Add success.
I0320 05:27:43.409829 543705 cpu.go:282] Add success.
I0320 05:27:43.420333 543705 net.go:648] Add success.
I0320 05:27:43.422694 543705 net.go:770] primary dev: ETH0
I0320 05:27:43.422710 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:27:43.422725 543705 net.go:698] Add success.
I0320 05:27:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:27:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:27:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:27:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:27:53.409773 543705 memory.go:184] no items to output this cycle
I0320 05:27:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 05:28:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:03.409814 543705 memory.go:184] no items to output this cycle
I0320 05:28:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 05:28:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:13.409782 543705 memory.go:191] Add success.
I0320 05:28:13.409804 543705 cpu.go:282] Add success.
W0320 05:28:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:28:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:28:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:28:13.420133 543705 net.go:648] Add success.
I0320 05:28:13.423326 543705 net.go:770] primary dev: ETH0
I0320 05:28:13.423354 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:28:13.423368 543705 net.go:698] Add success.
I0320 05:28:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:28:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:28:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 05:28:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:28:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 05:28:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:28:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:28:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:28:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:28:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:28:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:28:22.763455 543705 disk_info.go:125] begin check local disk info of client
I0320 05:28:22.765919 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:28:22.765925 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5580 0xc0000c55c0]
E0320 05:28:23.409211 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:23.409225 543705 memory.go:184] no items to output this cycle
I0320 05:28:23.409232 543705 cpu.go:275] no items to output this cycle
E0320 05:28:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:33.409789 543705 memory.go:184] no items to output this cycle
I0320 05:28:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 05:28:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:43.409810 543705 memory.go:191] Add success.
I0320 05:28:43.409824 543705 cpu.go:282] Add success.
I0320 05:28:43.419981 543705 net.go:648] Add success.
I0320 05:28:43.422693 543705 net.go:770] primary dev: ETH0
I0320 05:28:43.422706 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:28:43.422718 543705 net.go:698] Add success.
I0320 05:28:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:28:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:28:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:28:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:28:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 05:28:53.409795 543705 memory.go:184] no items to output this cycle
E0320 05:29:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:03.409793 543705 memory.go:184] no items to output this cycle
I0320 05:29:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 05:29:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:13.409799 543705 cpu.go:282] Add success.
I0320 05:29:13.409807 543705 memory.go:191] Add success.
W0320 05:29:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:29:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:29:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:29:13.420120 543705 net.go:648] Add success.
I0320 05:29:13.422630 543705 net.go:770] primary dev: ETH0
I0320 05:29:13.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:29:13.422655 543705 net.go:698] Add success.
I0320 05:29:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:29:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:29:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 05:29:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:29:14.456492 543705 disk_worker.go:494] system disk:vda1
I0320 05:29:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:29:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:29:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:29:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:29:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:29:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:29:22.766012 543705 disk_info.go:125] begin check local disk info of client
I0320 05:29:22.768656 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:29:22.768663 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000eda40 0xc0000eda80]
E0320 05:29:23.407865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:23.407881 543705 memory.go:184] no items to output this cycle
I0320 05:29:23.407911 543705 cpu.go:275] no items to output this cycle
E0320 05:29:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:33.409781 543705 memory.go:184] no items to output this cycle
I0320 05:29:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:29:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:43.409797 543705 memory.go:191] Add success.
I0320 05:29:43.409817 543705 cpu.go:282] Add success.
I0320 05:29:43.419959 543705 net.go:648] Add success.
I0320 05:29:43.422606 543705 net.go:770] primary dev: ETH0
I0320 05:29:43.422621 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:29:43.422636 543705 net.go:698] Add success.
I0320 05:29:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:29:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:29:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:29:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:29:53.409808 543705 memory.go:184] no items to output this cycle
I0320 05:29:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 05:30:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:03.409814 543705 memory.go:184] no items to output this cycle
I0320 05:30:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 05:30:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:13.409821 543705 memory.go:191] Add success.
I0320 05:30:13.409831 543705 cpu.go:282] Add success.
W0320 05:30:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:30:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:30:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:30:13.420206 543705 net.go:648] Add success.
I0320 05:30:13.422714 543705 net.go:770] primary dev: ETH0
I0320 05:30:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:30:13.422745 543705 net.go:698] Add success.
I0320 05:30:13.470360 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"73028b81-61a4-4ff5-b4bb-6848ab9557bc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:30:13.470392 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:30:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:30:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:30:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 05:30:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:30:14.456743 543705 disk_worker.go:494] system disk:vda1
I0320 05:30:14.456774 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:30:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:30:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:30:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:30:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:30:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:30:22.769672 543705 disk_info.go:125] begin check local disk info of client
I0320 05:30:22.772179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:30:22.772185 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036ad40 0xc00036ad80]
E0320 05:30:23.409436 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:23.409448 543705 memory.go:184] no items to output this cycle
I0320 05:30:23.409451 543705 cpu.go:275] no items to output this cycle
E0320 05:30:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 05:30:33.409793 543705 memory.go:184] no items to output this cycle
I0320 05:30:38.153741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:30:38.153748 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:30:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:43.410687 543705 memory.go:191] Add success.
I0320 05:30:43.409803 543705 cpu.go:282] Add success.
I0320 05:30:43.420426 543705 net.go:648] Add success.
I0320 05:30:43.423230 543705 net.go:770] primary dev: ETH0
I0320 05:30:43.423245 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:30:43.423261 543705 net.go:698] Add success.
I0320 05:30:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:30:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:30:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:30:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:30:53.409784 543705 memory.go:184] no items to output this cycle
I0320 05:30:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 05:31:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:03.409789 543705 memory.go:184] no items to output this cycle
I0320 05:31:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:31:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:13.409786 543705 memory.go:191] Add success.
W0320 05:31:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:31:13.409815 543705 cpu.go:282] Add success.
W0320 05:31:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:31:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:31:13.420127 543705 net.go:648] Add success.
I0320 05:31:13.422791 543705 net.go:770] primary dev: ETH0
I0320 05:31:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:31:13.422816 543705 net.go:698] Add success.
I0320 05:31:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:31:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:31:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 05:31:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:31:14.456608 543705 disk_worker.go:494] system disk:vda1
I0320 05:31:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:31:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:31:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:31:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:31:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:31:22.773502 543705 disk_info.go:125] begin check local disk info of client
I0320 05:31:22.776046 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:31:22.776053 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed300 0xc0000ed340]
E0320 05:31:23.409302 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:23.409319 543705 memory.go:184] no items to output this cycle
I0320 05:31:23.409338 543705 cpu.go:275] no items to output this cycle
I0320 05:31:33.409865 543705 cpu.go:275] no items to output this cycle
E0320 05:31:33.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:33.409887 543705 memory.go:184] no items to output this cycle
E0320 05:31:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:43.409790 543705 memory.go:191] Add success.
I0320 05:31:43.409809 543705 cpu.go:282] Add success.
I0320 05:31:43.419899 543705 net.go:648] Add success.
I0320 05:31:43.422483 543705 net.go:770] primary dev: ETH0
I0320 05:31:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:31:43.422513 543705 net.go:698] Add success.
I0320 05:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:31:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:31:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:31:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:31:53.409772 543705 memory.go:184] no items to output this cycle
I0320 05:31:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:32:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:03.409808 543705 memory.go:184] no items to output this cycle
I0320 05:32:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 05:32:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:13.409779 543705 memory.go:191] Add success.
W0320 05:32:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:32:13.409806 543705 cpu.go:282] Add success.
W0320 05:32:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:32:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:32:13.420071 543705 net.go:648] Add success.
I0320 05:32:13.423137 543705 net.go:770] primary dev: ETH0
I0320 05:32:13.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:32:13.423162 543705 net.go:698] Add success.
W0320 05:32:14.455141 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:32:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 05:32:14.455158 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:32:14.456901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:32:14.456910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:32:14.456916 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:32:14.457011 543705 disk_worker.go:494] system disk:vda1
I0320 05:32:14.457052 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:32:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:32:15.456847 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:32:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:32:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:32:16.458002 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:32:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:32:16.472328 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:32:22.777517 543705 disk_info.go:125] begin check local disk info of client
I0320 05:32:22.779978 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:32:22.779984 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aad00 0xc0001aad40]
E0320 05:32:23.409199 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:23.409216 543705 memory.go:184] no items to output this cycle
I0320 05:32:23.409231 543705 cpu.go:275] no items to output this cycle
E0320 05:32:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:33.409779 543705 memory.go:184] no items to output this cycle
I0320 05:32:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:32:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:43.409805 543705 memory.go:191] Add success.
I0320 05:32:43.409806 543705 cpu.go:282] Add success.
I0320 05:32:43.420103 543705 net.go:648] Add success.
I0320 05:32:43.422801 543705 net.go:770] primary dev: ETH0
I0320 05:32:43.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:32:43.422829 543705 net.go:698] Add success.
I0320 05:32:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:32:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:32:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:32:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:32:53.409793 543705 memory.go:184] no items to output this cycle
I0320 05:32:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 05:33:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:03.409785 543705 memory.go:184] no items to output this cycle
I0320 05:33:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 05:33:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:13.409788 543705 memory.go:191] Add success.
I0320 05:33:13.409806 543705 cpu.go:282] Add success.
W0320 05:33:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:33:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:33:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:33:13.420124 543705 net.go:648] Add success.
I0320 05:33:13.423321 543705 net.go:770] primary dev: ETH0
I0320 05:33:13.423333 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:33:13.423345 543705 net.go:698] Add success.
I0320 05:33:13.468454 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ea7afdc7-2ca7-46b5-817d-1d959fc9a056","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:33:13.468488 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:33:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:33:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:33:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 05:33:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:33:14.456695 543705 disk_worker.go:494] system disk:vda1
I0320 05:33:14.456731 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:33:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:33:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:33:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:33:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:33:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:33:22.781551 543705 disk_info.go:125] begin check local disk info of client
I0320 05:33:22.784168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:33:22.784175 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000dcc00 0xc0000dcc40]
E0320 05:33:23.409375 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:23.409394 543705 memory.go:184] no items to output this cycle
I0320 05:33:23.409405 543705 cpu.go:275] no items to output this cycle
E0320 05:33:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:33.409784 543705 memory.go:184] no items to output this cycle
I0320 05:33:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 05:33:38.157482 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:33:38.157490 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:33:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:43.410725 543705 memory.go:191] Add success.
I0320 05:33:43.409827 543705 cpu.go:282] Add success.
I0320 05:33:43.420413 543705 net.go:648] Add success.
I0320 05:33:43.423009 543705 net.go:770] primary dev: ETH0
I0320 05:33:43.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:33:43.423034 543705 net.go:698] Add success.
I0320 05:33:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:33:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:33:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:33:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:33:53.409775 543705 memory.go:184] no items to output this cycle
I0320 05:33:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 05:34:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:03.409786 543705 memory.go:184] no items to output this cycle
I0320 05:34:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:34:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:13.409820 543705 memory.go:191] Add success.
I0320 05:34:13.409823 543705 cpu.go:282] Add success.
W0320 05:34:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:34:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:34:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:34:13.420096 543705 net.go:648] Add success.
I0320 05:34:13.422772 543705 net.go:770] primary dev: ETH0
I0320 05:34:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:34:13.422796 543705 net.go:698] Add success.
I0320 05:34:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:34:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:34:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 05:34:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:34:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 05:34:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:34:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:34:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:34:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:34:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:34:22.785571 543705 disk_info.go:125] begin check local disk info of client
I0320 05:34:22.788030 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:34:22.788037 543705 disk_info.go:196] parse disk info done, disk is : [0xc000305240 0xc000305280]
E0320 05:34:23.407524 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:23.407535 543705 memory.go:184] no items to output this cycle
I0320 05:34:23.407535 543705 cpu.go:275] no items to output this cycle
E0320 05:34:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:33.409803 543705 memory.go:184] no items to output this cycle
I0320 05:34:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 05:34:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:43.409794 543705 memory.go:191] Add success.
I0320 05:34:43.409814 543705 cpu.go:282] Add success.
I0320 05:34:43.419955 543705 net.go:648] Add success.
I0320 05:34:43.422941 543705 net.go:770] primary dev: ETH0
I0320 05:34:43.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:34:43.422967 543705 net.go:698] Add success.
I0320 05:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:34:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:34:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:34:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:34:53.409765 543705 memory.go:184] no items to output this cycle
I0320 05:34:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 05:35:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:03.409810 543705 memory.go:184] no items to output this cycle
I0320 05:35:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 05:35:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:13.409780 543705 memory.go:191] Add success.
I0320 05:35:13.409799 543705 cpu.go:282] Add success.
W0320 05:35:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:35:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:35:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:35:13.420247 543705 net.go:648] Add success.
I0320 05:35:13.422793 543705 net.go:770] primary dev: ETH0
I0320 05:35:13.422806 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:35:13.422818 543705 net.go:698] Add success.
I0320 05:35:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:35:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:35:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 05:35:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:35:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 05:35:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:35:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:35:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:35:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:35:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:35:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:35:22.789588 543705 disk_info.go:125] begin check local disk info of client
I0320 05:35:22.792112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:35:22.792120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4b00 0xc0000c4b40]
E0320 05:35:23.409295 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:23.409314 543705 memory.go:184] no items to output this cycle
I0320 05:35:23.409329 543705 cpu.go:275] no items to output this cycle
E0320 05:35:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:33.409771 543705 memory.go:184] no items to output this cycle
I0320 05:35:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 05:35:43.409917 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:43.409926 543705 cpu.go:282] Add success.
I0320 05:35:43.409969 543705 memory.go:191] Add success.
I0320 05:35:43.419710 543705 net.go:648] Add success.
I0320 05:35:43.422518 543705 net.go:770] primary dev: ETH0
I0320 05:35:43.422531 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:35:43.422542 543705 net.go:698] Add success.
I0320 05:35:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:35:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:35:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:35:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:35:53.409785 543705 memory.go:184] no items to output this cycle
I0320 05:35:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:36:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:03.409789 543705 memory.go:184] no items to output this cycle
I0320 05:36:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:36:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:13.409791 543705 memory.go:191] Add success.
I0320 05:36:13.409799 543705 cpu.go:282] Add success.
W0320 05:36:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:36:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:36:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:36:13.419887 543705 net.go:770] primary dev: ETH0
I0320 05:36:13.419901 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:36:13.419913 543705 net.go:698] Add success.
I0320 05:36:13.420143 543705 net.go:648] Add success.
I0320 05:36:13.470247 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c8b8e07e-38fb-4b27-a978-b2852083dd56","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:36:13.470280 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:36:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:36:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:36:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 05:36:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:36:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 05:36:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:36:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:36:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:36:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:36:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:36:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:36:22.792203 543705 disk_info.go:125] begin check local disk info of client
I0320 05:36:22.794740 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:36:22.794747 543705 disk_info.go:196] parse disk info done, disk is : [0xc000305340 0xc000305380]
E0320 05:36:23.408876 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:23.408891 543705 memory.go:184] no items to output this cycle
I0320 05:36:23.408899 543705 cpu.go:275] no items to output this cycle
E0320 05:36:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:33.409775 543705 memory.go:184] no items to output this cycle
I0320 05:36:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 05:36:38.157731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:36:38.157738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:36:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:43.409965 543705 cpu.go:282] Add success.
I0320 05:36:43.410696 543705 memory.go:191] Add success.
I0320 05:36:43.419720 543705 net.go:648] Add success.
I0320 05:36:43.422638 543705 net.go:770] primary dev: ETH0
I0320 05:36:43.422650 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:36:43.422662 543705 net.go:698] Add success.
I0320 05:36:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:36:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:36:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:36:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:36:53.409776 543705 memory.go:184] no items to output this cycle
I0320 05:36:53.409776 543705 cpu.go:275] no items to output this cycle
E0320 05:37:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:03.409784 543705 memory.go:184] no items to output this cycle
I0320 05:37:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 05:37:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:13.409818 543705 memory.go:191] Add success.
I0320 05:37:13.409827 543705 cpu.go:282] Add success.
W0320 05:37:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:37:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:37:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:37:13.420221 543705 net.go:648] Add success.
I0320 05:37:13.422824 543705 net.go:770] primary dev: ETH0
I0320 05:37:13.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:37:13.422857 543705 net.go:698] Add success.
I0320 05:37:13.453444 543705 event_worker.go:152] Polling the log file for events...
W0320 05:37:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:37:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 05:37:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:37:14.456755 543705 disk_worker.go:494] system disk:vda1
I0320 05:37:14.456794 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:37:14.457119 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:37:14.457127 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:37:14.457132 543705 custom_config.go:64] query custom config with name: gpu
E0320 05:37:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:37:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:37:16.457937 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:37:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:37:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:37:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:37:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:37:22.796628 543705 disk_info.go:125] begin check local disk info of client
I0320 05:37:22.799175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:37:22.799182 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001efac0 0xc0001efb00]
E0320 05:37:23.409333 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:23.409349 543705 memory.go:184] no items to output this cycle
I0320 05:37:23.409357 543705 cpu.go:275] no items to output this cycle
E0320 05:37:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:33.409781 543705 memory.go:184] no items to output this cycle
I0320 05:37:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 05:37:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:43.409795 543705 memory.go:191] Add success.
I0320 05:37:43.409795 543705 cpu.go:282] Add success.
I0320 05:37:43.420072 543705 net.go:648] Add success.
I0320 05:37:43.423287 543705 net.go:770] primary dev: ETH0
I0320 05:37:43.423300 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:37:43.423312 543705 net.go:698] Add success.
I0320 05:37:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:37:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:37:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:37:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:37:53.409768 543705 memory.go:184] no items to output this cycle
I0320 05:37:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:38:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:03.409789 543705 memory.go:184] no items to output this cycle
I0320 05:38:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 05:38:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:13.409778 543705 memory.go:191] Add success.
W0320 05:38:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:38:13.409808 543705 cpu.go:282] Add success.
W0320 05:38:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:38:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:38:13.420162 543705 net.go:648] Add success.
I0320 05:38:13.422813 543705 net.go:770] primary dev: ETH0
I0320 05:38:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:38:13.422839 543705 net.go:698] Add success.
I0320 05:38:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:38:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:38:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 05:38:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:38:14.456608 543705 disk_worker.go:494] system disk:vda1
I0320 05:38:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:38:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:38:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:38:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:38:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:38:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:38:22.800629 543705 disk_info.go:125] begin check local disk info of client
I0320 05:38:22.803106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:38:22.803112 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e500 0xc00034e540]
E0320 05:38:23.409189 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:23.409201 543705 memory.go:184] no items to output this cycle
I0320 05:38:23.409240 543705 cpu.go:275] no items to output this cycle
E0320 05:38:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:33.409795 543705 memory.go:184] no items to output this cycle
I0320 05:38:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:38:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:43.409795 543705 memory.go:191] Add success.
I0320 05:38:43.409795 543705 cpu.go:282] Add success.
I0320 05:38:43.420054 543705 net.go:648] Add success.
I0320 05:38:43.423124 543705 net.go:770] primary dev: ETH0
I0320 05:38:43.423144 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:38:43.423327 543705 net.go:698] Add success.
I0320 05:38:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:38:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:38:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:38:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:38:53.409781 543705 memory.go:184] no items to output this cycle
I0320 05:38:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 05:39:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:03.409796 543705 memory.go:184] no items to output this cycle
I0320 05:39:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 05:39:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:13.409827 543705 memory.go:191] Add success.
I0320 05:39:13.409832 543705 cpu.go:282] Add success.
W0320 05:39:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:39:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:39:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:39:13.420113 543705 net.go:648] Add success.
I0320 05:39:13.422960 543705 net.go:770] primary dev: ETH0
I0320 05:39:13.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:39:13.422986 543705 net.go:698] Add success.
I0320 05:39:13.865943 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3a16e799-f3e4-444b-bd9c-495b49987bde","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:39:13.865978 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:39:14.454731 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:39:14.454903 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:39:14.454913 543705 disk_worker.go:708] disk space is not compliant
W0320 05:39:14.454916 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:39:14.456252 543705 disk_worker.go:494] system disk:vda1
I0320 05:39:14.456297 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:39:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:39:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:39:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:39:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:39:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:39:22.803200 543705 disk_info.go:125] begin check local disk info of client
I0320 05:39:22.805836 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:39:22.805843 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4980 0xc0000c49c0]
E0320 05:39:23.408912 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:23.408927 543705 memory.go:184] no items to output this cycle
I0320 05:39:23.408963 543705 cpu.go:275] no items to output this cycle
E0320 05:39:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:33.409801 543705 memory.go:184] no items to output this cycle
I0320 05:39:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 05:39:38.161488 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:39:38.161495 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:39:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:43.410719 543705 memory.go:191] Add success.
I0320 05:39:43.409831 543705 cpu.go:282] Add success.
I0320 05:39:43.420448 543705 net.go:648] Add success.
I0320 05:39:43.423391 543705 net.go:770] primary dev: ETH0
I0320 05:39:43.423407 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:39:43.423420 543705 net.go:698] Add success.
I0320 05:39:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:39:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:39:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:39:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:39:53.409808 543705 memory.go:184] no items to output this cycle
I0320 05:39:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 05:40:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:03.409803 543705 memory.go:184] no items to output this cycle
I0320 05:40:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 05:40:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:13.409797 543705 memory.go:191] Add success.
I0320 05:40:13.409798 543705 cpu.go:282] Add success.
W0320 05:40:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:40:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:40:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:40:13.420157 543705 net.go:648] Add success.
I0320 05:40:13.422767 543705 net.go:770] primary dev: ETH0
I0320 05:40:13.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:40:13.422793 543705 net.go:698] Add success.
I0320 05:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:40:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:40:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 05:40:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:40:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 05:40:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:40:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:40:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:40:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:40:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:40:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:40:22.807675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:40:22.810147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:40:22.810153 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5480 0xc0000c54c0]
E0320 05:40:23.409232 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:23.409249 543705 memory.go:184] no items to output this cycle
I0320 05:40:23.409265 543705 cpu.go:275] no items to output this cycle
E0320 05:40:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:33.409819 543705 memory.go:184] no items to output this cycle
I0320 05:40:33.409836 543705 cpu.go:275] no items to output this cycle
E0320 05:40:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:43.409831 543705 memory.go:191] Add success.
I0320 05:40:43.409838 543705 cpu.go:282] Add success.
I0320 05:40:43.419887 543705 net.go:648] Add success.
I0320 05:40:43.422562 543705 net.go:770] primary dev: ETH0
I0320 05:40:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:40:43.422588 543705 net.go:698] Add success.
I0320 05:40:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:40:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:40:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:40:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:40:53.409817 543705 memory.go:184] no items to output this cycle
I0320 05:40:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 05:41:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:03.409820 543705 memory.go:184] no items to output this cycle
I0320 05:41:03.409832 543705 cpu.go:275] no items to output this cycle
E0320 05:41:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:13.409818 543705 memory.go:191] Add success.
I0320 05:41:13.409825 543705 cpu.go:282] Add success.
W0320 05:41:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:41:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:41:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:41:13.420158 543705 net.go:648] Add success.
I0320 05:41:13.423089 543705 net.go:770] primary dev: ETH0
I0320 05:41:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:41:13.423117 543705 net.go:698] Add success.
I0320 05:41:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:41:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:41:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 05:41:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:41:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 05:41:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:41:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:41:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:41:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:41:16.472433 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:41:22.811694 543705 disk_info.go:125] begin check local disk info of client
I0320 05:41:22.814279 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:41:22.814287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005559c0 0xc000555a00]
E0320 05:41:23.409311 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:23.409325 543705 memory.go:184] no items to output this cycle
I0320 05:41:23.409361 543705 cpu.go:275] no items to output this cycle
E0320 05:41:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:33.409805 543705 memory.go:184] no items to output this cycle
I0320 05:41:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 05:41:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:43.409811 543705 memory.go:191] Add success.
I0320 05:41:43.409820 543705 cpu.go:282] Add success.
I0320 05:41:43.419902 543705 net.go:648] Add success.
I0320 05:41:43.422681 543705 net.go:770] primary dev: ETH0
I0320 05:41:43.422695 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:41:43.422715 543705 net.go:698] Add success.
I0320 05:41:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:41:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:41:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:41:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:41:53.409773 543705 memory.go:184] no items to output this cycle
I0320 05:41:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 05:42:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:03.409785 543705 memory.go:184] no items to output this cycle
I0320 05:42:03.409914 543705 cpu.go:275] no items to output this cycle
E0320 05:42:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:13.409798 543705 memory.go:191] Add success.
I0320 05:42:13.409800 543705 cpu.go:282] Add success.
W0320 05:42:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:42:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:42:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:42:13.420186 543705 net.go:648] Add success.
I0320 05:42:13.423018 543705 net.go:770] primary dev: ETH0
I0320 05:42:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:42:13.423043 543705 net.go:698] Add success.
I0320 05:42:13.464556 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"13268bb6-f15b-4491-b2af-081431c34c9e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:42:13.464597 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 05:42:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:42:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 05:42:14.455190 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:42:14.456854 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
I0320 05:42:14.456870 543705 disk_worker.go:494] system disk:vda1
E0320 05:42:14.456863 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:42:14.456880 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:42:14.456908 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:42:15.456787 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:42:15.456796 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:42:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:42:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:42:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:42:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:42:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:42:22.814370 543705 disk_info.go:125] begin check local disk info of client
I0320 05:42:22.816784 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:42:22.816790 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaf80 0xc0001aafc0]
E0320 05:42:23.408826 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:23.408841 543705 memory.go:184] no items to output this cycle
I0320 05:42:23.408855 543705 cpu.go:275] no items to output this cycle
E0320 05:42:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:33.409795 543705 memory.go:184] no items to output this cycle
I0320 05:42:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 05:42:38.161732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:42:38.161739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:42:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:43.410674 543705 memory.go:191] Add success.
I0320 05:42:43.409810 543705 cpu.go:282] Add success.
I0320 05:42:43.420368 543705 net.go:648] Add success.
I0320 05:42:43.423137 543705 net.go:770] primary dev: ETH0
I0320 05:42:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:42:43.423165 543705 net.go:698] Add success.
I0320 05:42:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:42:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:42:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:42:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:42:53.409763 543705 memory.go:184] no items to output this cycle
I0320 05:42:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 05:43:03.409932 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:03.410001 543705 cpu.go:275] no items to output this cycle
I0320 05:43:03.410039 543705 memory.go:184] no items to output this cycle
E0320 05:43:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:13.409783 543705 memory.go:191] Add success.
W0320 05:43:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:43:13.409815 543705 cpu.go:282] Add success.
W0320 05:43:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:43:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:43:13.419940 543705 net.go:770] primary dev: ETH0
I0320 05:43:13.419951 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:43:13.419964 543705 net.go:698] Add success.
I0320 05:43:13.420205 543705 net.go:648] Add success.
I0320 05:43:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:43:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:43:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 05:43:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:43:14.456492 543705 disk_worker.go:494] system disk:vda1
I0320 05:43:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:43:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:43:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:43:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:43:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:43:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:43:22.817678 543705 disk_info.go:125] begin check local disk info of client
I0320 05:43:22.820190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:43:22.820197 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed600 0xc0000ed640]
E0320 05:43:23.409229 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:23.409246 543705 memory.go:184] no items to output this cycle
I0320 05:43:23.409260 543705 cpu.go:275] no items to output this cycle
E0320 05:43:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:33.409802 543705 memory.go:184] no items to output this cycle
I0320 05:43:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 05:43:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:43.409792 543705 memory.go:191] Add success.
I0320 05:43:43.409791 543705 cpu.go:282] Add success.
I0320 05:43:43.419906 543705 net.go:648] Add success.
I0320 05:43:43.422505 543705 net.go:770] primary dev: ETH0
I0320 05:43:43.422520 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:43:43.422533 543705 net.go:698] Add success.
I0320 05:43:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:43:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:43:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:43:53.410267 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:43:53.410284 543705 memory.go:184] no items to output this cycle
I0320 05:43:53.410286 543705 cpu.go:275] no items to output this cycle
E0320 05:44:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:03.409780 543705 memory.go:184] no items to output this cycle
I0320 05:44:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 05:44:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:13.409790 543705 memory.go:191] Add success.
I0320 05:44:13.409791 543705 cpu.go:282] Add success.
W0320 05:44:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:44:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:44:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:44:13.420122 543705 net.go:648] Add success.
I0320 05:44:13.422757 543705 net.go:770] primary dev: ETH0
I0320 05:44:13.422770 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:44:13.422782 543705 net.go:698] Add success.
I0320 05:44:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:44:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:44:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 05:44:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:44:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 05:44:14.456645 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:44:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:44:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:44:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:44:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:44:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:44:22.821673 543705 disk_info.go:125] begin check local disk info of client
I0320 05:44:22.824099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:44:22.824105 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b580 0xc00007b5c0]
E0320 05:44:23.409115 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:23.409130 543705 memory.go:184] no items to output this cycle
I0320 05:44:23.409144 543705 cpu.go:275] no items to output this cycle
E0320 05:44:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:33.409783 543705 memory.go:184] no items to output this cycle
I0320 05:44:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 05:44:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:43.409796 543705 memory.go:191] Add success.
I0320 05:44:43.409797 543705 cpu.go:282] Add success.
I0320 05:44:43.419875 543705 net.go:648] Add success.
I0320 05:44:43.422233 543705 net.go:770] primary dev: ETH0
I0320 05:44:43.422246 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:44:43.422261 543705 net.go:698] Add success.
I0320 05:44:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:44:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:44:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:44:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:44:53.409784 543705 memory.go:184] no items to output this cycle
I0320 05:44:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 05:45:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:03.409787 543705 memory.go:184] no items to output this cycle
I0320 05:45:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 05:45:13.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:13.409912 543705 memory.go:191] Add success.
W0320 05:45:13.409943 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:45:13.409956 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:45:13.409963 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:45:13.409970 543705 cpu.go:282] Add success.
I0320 05:45:13.419706 543705 net.go:648] Add success.
I0320 05:45:13.422269 543705 net.go:770] primary dev: ETH0
I0320 05:45:13.422281 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:45:13.422292 543705 net.go:698] Add success.
I0320 05:45:13.467836 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7862e44e-dee9-4d00-b07b-af09fb8f522d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:45:13.467867 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:45:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:45:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:45:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 05:45:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:45:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 05:45:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:45:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:45:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:45:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:45:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:45:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:45:22.825678 543705 disk_info.go:125] begin check local disk info of client
I0320 05:45:22.828220 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:45:22.828227 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5740 0xc0000c5780]
E0320 05:45:23.407515 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:23.407529 543705 memory.go:184] no items to output this cycle
I0320 05:45:23.407550 543705 cpu.go:275] no items to output this cycle
E0320 05:45:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:33.409803 543705 memory.go:184] no items to output this cycle
I0320 05:45:33.409815 543705 cpu.go:275] no items to output this cycle
I0320 05:45:38.165507 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:45:38.165514 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:45:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:43.410623 543705 memory.go:191] Add success.
I0320 05:45:43.409809 543705 cpu.go:282] Add success.
I0320 05:45:43.420373 543705 net.go:648] Add success.
I0320 05:45:43.423156 543705 net.go:770] primary dev: ETH0
I0320 05:45:43.423168 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:45:43.423181 543705 net.go:698] Add success.
I0320 05:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:45:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:45:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:45:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:45:53.409766 543705 memory.go:184] no items to output this cycle
I0320 05:45:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:46:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:03.409782 543705 memory.go:184] no items to output this cycle
I0320 05:46:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 05:46:13.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:13.409888 543705 memory.go:191] Add success.
W0320 05:46:13.409924 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:46:13.409943 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:46:13.409954 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:46:13.409973 543705 cpu.go:282] Add success.
I0320 05:46:13.419722 543705 net.go:648] Add success.
I0320 05:46:13.422636 543705 net.go:770] primary dev: ETH0
I0320 05:46:13.422648 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:46:13.422660 543705 net.go:698] Add success.
I0320 05:46:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:46:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:46:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 05:46:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:46:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 05:46:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:46:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:46:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:46:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:46:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:46:16.472436 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:46:22.829675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:46:22.832158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:46:22.832165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2cc0 0xc0002b2d00]
E0320 05:46:23.409109 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:23.409124 543705 memory.go:184] no items to output this cycle
I0320 05:46:23.409149 543705 cpu.go:275] no items to output this cycle
E0320 05:46:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:33.409779 543705 memory.go:184] no items to output this cycle
I0320 05:46:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 05:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:43.409793 543705 memory.go:191] Add success.
I0320 05:46:43.409793 543705 cpu.go:282] Add success.
I0320 05:46:43.419860 543705 net.go:648] Add success.
I0320 05:46:43.422602 543705 net.go:770] primary dev: ETH0
I0320 05:46:43.422615 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:46:43.422626 543705 net.go:698] Add success.
I0320 05:46:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:46:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:46:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:46:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:46:53.409789 543705 memory.go:184] no items to output this cycle
I0320 05:46:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:47:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:03.409789 543705 memory.go:184] no items to output this cycle
I0320 05:47:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:47:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:13.409788 543705 memory.go:191] Add success.
I0320 05:47:13.409792 543705 cpu.go:282] Add success.
W0320 05:47:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:47:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:47:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:47:13.420192 543705 net.go:648] Add success.
I0320 05:47:13.423165 543705 net.go:770] primary dev: ETH0
I0320 05:47:13.423178 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:47:13.423189 543705 net.go:698] Add success.
I0320 05:47:13.452880 543705 event_worker.go:152] Polling the log file for events...
W0320 05:47:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:47:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 05:47:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:47:14.455920 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:47:14.455928 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:47:14.455934 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:47:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 05:47:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:47:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:47:15.456829 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:47:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:47:16.457989 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:47:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:47:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:47:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:47:22.833678 543705 disk_info.go:125] begin check local disk info of client
I0320 05:47:22.836160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:47:22.836167 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8640 0xc0003c8680]
E0320 05:47:23.407525 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:23.407539 543705 memory.go:184] no items to output this cycle
I0320 05:47:23.407551 543705 cpu.go:275] no items to output this cycle
E0320 05:47:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:33.409768 543705 memory.go:184] no items to output this cycle
I0320 05:47:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 05:47:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:43.409808 543705 memory.go:191] Add success.
I0320 05:47:43.409817 543705 cpu.go:282] Add success.
I0320 05:47:43.419852 543705 net.go:648] Add success.
I0320 05:47:43.422378 543705 net.go:770] primary dev: ETH0
I0320 05:47:43.422391 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:47:43.422404 543705 net.go:698] Add success.
I0320 05:47:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:47:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:47:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:47:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:47:53.409795 543705 memory.go:184] no items to output this cycle
I0320 05:47:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 05:48:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:03.409784 543705 memory.go:184] no items to output this cycle
I0320 05:48:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 05:48:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:13.409811 543705 memory.go:191] Add success.
I0320 05:48:13.409815 543705 cpu.go:282] Add success.
W0320 05:48:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:48:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:48:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:48:13.420583 543705 net.go:648] Add success.
I0320 05:48:13.423377 543705 net.go:770] primary dev: ETH0
I0320 05:48:13.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:48:13.423402 543705 net.go:698] Add success.
I0320 05:48:13.463098 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fe146765-ccd5-4fbe-9116-4b202a2d5f42","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:48:13.463128 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:48:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:48:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:48:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 05:48:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:48:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 05:48:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:48:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:48:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:48:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:48:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:48:16.472366 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:48:22.837676 543705 disk_info.go:125] begin check local disk info of client
I0320 05:48:22.840133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:48:22.840140 543705 disk_info.go:196] parse disk info done, disk is : [0xc000518b80 0xc000518bc0]
E0320 05:48:23.409069 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:23.409087 543705 memory.go:184] no items to output this cycle
I0320 05:48:23.409099 543705 cpu.go:275] no items to output this cycle
E0320 05:48:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:33.409781 543705 memory.go:184] no items to output this cycle
I0320 05:48:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 05:48:38.165734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:48:38.165740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:48:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:43.410737 543705 memory.go:191] Add success.
I0320 05:48:43.409793 543705 cpu.go:282] Add success.
I0320 05:48:43.420440 543705 net.go:648] Add success.
I0320 05:48:43.423047 543705 net.go:770] primary dev: ETH0
I0320 05:48:43.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:48:43.423075 543705 net.go:698] Add success.
I0320 05:48:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:48:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:48:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:48:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:48:53.409801 543705 memory.go:184] no items to output this cycle
I0320 05:48:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 05:49:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:03.409787 543705 memory.go:184] no items to output this cycle
I0320 05:49:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 05:49:13.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:13.409928 543705 memory.go:191] Add success.
I0320 05:49:13.409944 543705 cpu.go:282] Add success.
W0320 05:49:13.410092 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:49:13.410106 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:49:13.410109 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:49:13.419722 543705 net.go:648] Add success.
I0320 05:49:13.422507 543705 net.go:770] primary dev: ETH0
I0320 05:49:13.422522 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:49:13.422536 543705 net.go:698] Add success.
I0320 05:49:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:49:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:49:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 05:49:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:49:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 05:49:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:49:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:49:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:49:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:49:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:49:16.472415 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:49:22.841680 543705 disk_info.go:125] begin check local disk info of client
I0320 05:49:22.844239 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:49:22.844246 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464740 0xc000464780]
E0320 05:49:23.409126 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:23.409140 543705 memory.go:184] no items to output this cycle
I0320 05:49:23.409174 543705 cpu.go:275] no items to output this cycle
E0320 05:49:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:33.409770 543705 memory.go:184] no items to output this cycle
I0320 05:49:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 05:49:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:43.409779 543705 memory.go:191] Add success.
I0320 05:49:43.409800 543705 cpu.go:282] Add success.
I0320 05:49:43.419892 543705 net.go:648] Add success.
I0320 05:49:43.422539 543705 net.go:770] primary dev: ETH0
I0320 05:49:43.422553 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:49:43.422566 543705 net.go:698] Add success.
I0320 05:49:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:49:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:49:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:49:53.410333 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:49:53.410347 543705 memory.go:184] no items to output this cycle
I0320 05:49:53.410351 543705 cpu.go:275] no items to output this cycle
E0320 05:50:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:03.409789 543705 memory.go:184] no items to output this cycle
I0320 05:50:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:50:13.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:13.409897 543705 cpu.go:282] Add success.
I0320 05:50:13.409911 543705 memory.go:191] Add success.
W0320 05:50:13.409969 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:50:13.409991 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:50:13.409996 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:50:13.419767 543705 net.go:648] Add success.
I0320 05:50:13.422638 543705 net.go:770] primary dev: ETH0
I0320 05:50:13.422652 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:50:13.422675 543705 net.go:698] Add success.
I0320 05:50:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:50:14.455078 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:50:14.455142 543705 disk_worker.go:708] disk space is not compliant
W0320 05:50:14.455145 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:50:14.456460 543705 disk_worker.go:494] system disk:vda1
I0320 05:50:14.456504 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:50:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:50:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:50:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:50:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:50:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:50:22.845688 543705 disk_info.go:125] begin check local disk info of client
I0320 05:50:22.848204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:50:22.848211 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa280 0xc0001aa2c0]
E0320 05:50:23.409088 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:23.409096 543705 cpu.go:275] no items to output this cycle
I0320 05:50:23.409101 543705 memory.go:184] no items to output this cycle
E0320 05:50:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:33.409812 543705 memory.go:184] no items to output this cycle
I0320 05:50:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 05:50:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:43.409794 543705 memory.go:191] Add success.
I0320 05:50:43.409809 543705 cpu.go:282] Add success.
I0320 05:50:43.419936 543705 net.go:648] Add success.
I0320 05:50:43.422475 543705 net.go:770] primary dev: ETH0
I0320 05:50:43.422487 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:50:43.422499 543705 net.go:698] Add success.
I0320 05:50:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:50:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:50:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:50:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:50:53.409784 543705 memory.go:184] no items to output this cycle
I0320 05:50:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:51:03.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:03.409818 543705 memory.go:184] no items to output this cycle
I0320 05:51:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 05:51:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:13.409807 543705 memory.go:191] Add success.
I0320 05:51:13.409823 543705 cpu.go:282] Add success.
W0320 05:51:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:51:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:51:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:51:13.420058 543705 net.go:648] Add success.
I0320 05:51:13.422713 543705 net.go:770] primary dev: ETH0
I0320 05:51:13.422728 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:51:13.422751 543705 net.go:698] Add success.
I0320 05:51:13.463791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3e12b123-ad00-4401-b136-8939e4332714","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:51:13.463823 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:51:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:51:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:51:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 05:51:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:51:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 05:51:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:51:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:51:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:51:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:51:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:51:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:51:22.849680 543705 disk_info.go:125] begin check local disk info of client
I0320 05:51:22.852310 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:51:22.852316 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ad40 0xc00007ad80]
E0320 05:51:23.407869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:23.407883 543705 memory.go:184] no items to output this cycle
I0320 05:51:23.407894 543705 cpu.go:275] no items to output this cycle
E0320 05:51:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:33.409796 543705 memory.go:184] no items to output this cycle
I0320 05:51:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 05:51:38.165882 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:51:38.165888 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:51:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:43.410590 543705 memory.go:191] Add success.
I0320 05:51:43.409824 543705 cpu.go:282] Add success.
I0320 05:51:43.420260 543705 net.go:648] Add success.
I0320 05:51:43.422701 543705 net.go:770] primary dev: ETH0
I0320 05:51:43.422714 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:51:43.422727 543705 net.go:698] Add success.
I0320 05:51:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:51:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:51:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:51:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:51:53.409777 543705 cpu.go:275] no items to output this cycle
I0320 05:51:53.409783 543705 memory.go:184] no items to output this cycle
E0320 05:52:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:03.409794 543705 memory.go:184] no items to output this cycle
I0320 05:52:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 05:52:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:13.409797 543705 memory.go:191] Add success.
W0320 05:52:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:52:13.409842 543705 cpu.go:282] Add success.
W0320 05:52:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:52:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:52:13.420008 543705 net.go:770] primary dev: ETH0
I0320 05:52:13.420023 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:52:13.420037 543705 net.go:698] Add success.
I0320 05:52:13.420380 543705 net.go:648] Add success.
W0320 05:52:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:52:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 05:52:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:52:14.456851 543705 disk_worker.go:494] system disk:vda1
I0320 05:52:14.456901 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:52:14.457126 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:52:14.457134 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:52:14.457139 543705 custom_config.go:64] query custom config with name: gpu
E0320 05:52:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:52:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:52:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:52:16.457919 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:52:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:52:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:52:16.472283 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:52:22.853677 543705 disk_info.go:125] begin check local disk info of client
I0320 05:52:22.856167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:52:22.856174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e880 0xc00037e8c0]
E0320 05:52:23.409020 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:23.409037 543705 memory.go:184] no items to output this cycle
I0320 05:52:23.409053 543705 cpu.go:275] no items to output this cycle
E0320 05:52:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 05:52:33.409803 543705 memory.go:184] no items to output this cycle
E0320 05:52:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:43.409793 543705 memory.go:191] Add success.
I0320 05:52:43.409794 543705 cpu.go:282] Add success.
I0320 05:52:43.419854 543705 net.go:648] Add success.
I0320 05:52:43.422527 543705 net.go:770] primary dev: ETH0
I0320 05:52:43.422540 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:52:43.422552 543705 net.go:698] Add success.
I0320 05:52:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:52:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:52:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:52:53.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:52:53.409760 543705 memory.go:184] no items to output this cycle
I0320 05:52:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 05:53:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:03.409922 543705 memory.go:184] no items to output this cycle
I0320 05:53:03.409934 543705 cpu.go:275] no items to output this cycle
E0320 05:53:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:13.409786 543705 memory.go:191] Add success.
I0320 05:53:13.409804 543705 cpu.go:282] Add success.
W0320 05:53:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:53:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:53:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:53:13.420138 543705 net.go:648] Add success.
I0320 05:53:13.422841 543705 net.go:770] primary dev: ETH0
I0320 05:53:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:53:13.422867 543705 net.go:698] Add success.
I0320 05:53:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:53:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:53:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 05:53:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:53:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 05:53:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:53:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:53:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:53:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:53:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:53:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:53:22.857677 543705 disk_info.go:125] begin check local disk info of client
I0320 05:53:22.860308 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:53:22.860314 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f4680 0xc0003f46c0]
E0320 05:53:23.409125 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:23.409139 543705 memory.go:184] no items to output this cycle
I0320 05:53:23.409154 543705 cpu.go:275] no items to output this cycle
E0320 05:53:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:33.409776 543705 memory.go:184] no items to output this cycle
I0320 05:53:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 05:53:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:43.409809 543705 memory.go:191] Add success.
I0320 05:53:43.409822 543705 cpu.go:282] Add success.
I0320 05:53:43.419950 543705 net.go:648] Add success.
I0320 05:53:43.422631 543705 net.go:770] primary dev: ETH0
I0320 05:53:43.422644 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:53:43.422657 543705 net.go:698] Add success.
I0320 05:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:53:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:53:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:53:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:53:53.409776 543705 memory.go:184] no items to output this cycle
I0320 05:53:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 05:54:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:03.409883 543705 memory.go:184] no items to output this cycle
I0320 05:54:03.409928 543705 cpu.go:275] no items to output this cycle
E0320 05:54:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:13.409789 543705 cpu.go:282] Add success.
I0320 05:54:13.409799 543705 memory.go:191] Add success.
W0320 05:54:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:54:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:54:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:54:13.420158 543705 net.go:648] Add success.
I0320 05:54:13.423167 543705 net.go:770] primary dev: ETH0
I0320 05:54:13.423180 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:54:13.423192 543705 net.go:698] Add success.
I0320 05:54:13.532265 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"25352091-a420-4e20-96ff-472b152912fc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:54:13.532298 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 05:54:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:54:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:54:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 05:54:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:54:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 05:54:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:54:15.455604 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:54:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:54:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:54:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:54:16.472453 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:54:22.861674 543705 disk_info.go:125] begin check local disk info of client
I0320 05:54:22.864203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:54:22.864209 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7080 0xc0001c70c0]
E0320 05:54:23.407519 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:23.407531 543705 memory.go:184] no items to output this cycle
I0320 05:54:23.407566 543705 cpu.go:275] no items to output this cycle
E0320 05:54:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:33.409810 543705 memory.go:184] no items to output this cycle
I0320 05:54:33.409815 543705 cpu.go:275] no items to output this cycle
I0320 05:54:38.166024 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:54:38.166030 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:54:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:43.410645 543705 memory.go:191] Add success.
I0320 05:54:43.409815 543705 cpu.go:282] Add success.
I0320 05:54:43.420339 543705 net.go:648] Add success.
I0320 05:54:43.422934 543705 net.go:770] primary dev: ETH0
I0320 05:54:43.422947 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:54:43.422960 543705 net.go:698] Add success.
I0320 05:54:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:54:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:54:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:54:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:54:53.409789 543705 memory.go:184] no items to output this cycle
I0320 05:54:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 05:55:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:03.409792 543705 memory.go:184] no items to output this cycle
I0320 05:55:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 05:55:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:13.409779 543705 memory.go:191] Add success.
W0320 05:55:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:55:13.409809 543705 cpu.go:282] Add success.
W0320 05:55:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:55:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:55:13.419953 543705 net.go:770] primary dev: ETH0
I0320 05:55:13.419966 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:55:13.419978 543705 net.go:698] Add success.
I0320 05:55:13.420327 543705 net.go:648] Add success.
I0320 05:55:14.453926 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:55:14.455213 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:55:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 05:55:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:55:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 05:55:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:55:15.455985 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:55:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:55:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:55:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:55:16.472457 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:55:22.865675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:55:22.868222 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:55:22.868229 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ad40 0xc00007ad80]
E0320 05:55:23.409006 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:23.409021 543705 memory.go:184] no items to output this cycle
I0320 05:55:23.409029 543705 cpu.go:275] no items to output this cycle
E0320 05:55:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:33.409777 543705 memory.go:184] no items to output this cycle
I0320 05:55:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 05:55:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:43.409809 543705 memory.go:191] Add success.
I0320 05:55:43.409816 543705 cpu.go:282] Add success.
I0320 05:55:43.419897 543705 net.go:648] Add success.
I0320 05:55:43.422548 543705 net.go:770] primary dev: ETH0
I0320 05:55:43.422563 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:55:43.422577 543705 net.go:698] Add success.
I0320 05:55:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:55:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:55:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:55:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:55:53.409776 543705 memory.go:184] no items to output this cycle
I0320 05:55:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 05:56:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:03.409799 543705 memory.go:184] no items to output this cycle
I0320 05:56:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 05:56:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:13.409797 543705 memory.go:191] Add success.
I0320 05:56:13.409812 543705 cpu.go:282] Add success.
W0320 05:56:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:56:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:56:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:56:13.420163 543705 net.go:648] Add success.
I0320 05:56:13.422974 543705 net.go:770] primary dev: ETH0
I0320 05:56:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:56:13.422999 543705 net.go:698] Add success.
I0320 05:56:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:56:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:56:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 05:56:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:56:14.456528 543705 disk_worker.go:494] system disk:vda1
I0320 05:56:14.456576 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:56:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:56:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:56:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:56:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:56:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:56:22.869675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:56:22.872141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:56:22.872148 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047a500 0xc00047a540]
E0320 05:56:23.407522 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:23.407539 543705 memory.go:184] no items to output this cycle
I0320 05:56:23.407555 543705 cpu.go:275] no items to output this cycle
E0320 05:56:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:33.409795 543705 memory.go:184] no items to output this cycle
I0320 05:56:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 05:56:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:43.409827 543705 memory.go:191] Add success.
I0320 05:56:43.409828 543705 cpu.go:282] Add success.
I0320 05:56:43.419899 543705 net.go:648] Add success.
I0320 05:56:43.422620 543705 net.go:770] primary dev: ETH0
I0320 05:56:43.422632 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:56:43.422644 543705 net.go:698] Add success.
I0320 05:56:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:56:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:56:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:56:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:56:53.409808 543705 memory.go:184] no items to output this cycle
I0320 05:56:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 05:57:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:03.409800 543705 memory.go:184] no items to output this cycle
I0320 05:57:03.409829 543705 cpu.go:275] no items to output this cycle
E0320 05:57:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:13.409817 543705 memory.go:191] Add success.
I0320 05:57:13.409823 543705 cpu.go:282] Add success.
W0320 05:57:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:57:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:57:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:57:13.419770 543705 net.go:648] Add success.
I0320 05:57:13.428685 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 05:57:13.428768 543705 net.go:770] primary dev: ETH0
I0320 05:57:13.428781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:57:13.428792 543705 net.go:698] Add success.
I0320 05:57:13.453393 543705 event_worker.go:152] Polling the log file for events...
I0320 05:57:13.463773 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f0c3e47e-a4fd-4f25-a177-95b29d9f1a40","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 05:57:13.463804 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 05:57:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:57:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 05:57:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0320 05:57:14.456792 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 05:57:14.456802 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 05:57:14.456807 543705 custom_config.go:64] query custom config with name: gpu
I0320 05:57:14.456857 543705 disk_worker.go:494] system disk:vda1
I0320 05:57:14.456897 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 05:57:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 05:57:15.456828 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:57:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 05:57:16.457977 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 05:57:16.458020 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:57:16.458036 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:57:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:57:22.873678 543705 disk_info.go:125] begin check local disk info of client
I0320 05:57:22.876275 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:57:22.876282 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7780 0xc0001c77c0]
E0320 05:57:23.408997 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:23.409009 543705 memory.go:184] no items to output this cycle
I0320 05:57:23.409043 543705 cpu.go:275] no items to output this cycle
E0320 05:57:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:33.409764 543705 memory.go:184] no items to output this cycle
I0320 05:57:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 05:57:38.169534 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 05:57:38.169540 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 05:57:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:43.409778 543705 memory.go:191] Add success.
I0320 05:57:43.409797 543705 cpu.go:282] Add success.
I0320 05:57:43.419866 543705 net.go:648] Add success.
I0320 05:57:43.420764 543705 net.go:770] primary dev: ETH0
I0320 05:57:43.420777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:57:43.420804 543705 net.go:698] Add success.
I0320 05:57:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:57:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:57:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:57:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:57:53.409775 543705 memory.go:184] no items to output this cycle
I0320 05:57:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 05:58:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:03.409825 543705 memory.go:184] no items to output this cycle
I0320 05:58:03.409838 543705 cpu.go:275] no items to output this cycle
E0320 05:58:13.409854 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:13.409882 543705 memory.go:191] Add success.
W0320 05:58:13.409910 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 05:58:13.409922 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:58:13.409929 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:58:13.409932 543705 cpu.go:282] Add success.
I0320 05:58:13.419706 543705 net.go:648] Add success.
I0320 05:58:13.422383 543705 net.go:770] primary dev: ETH0
I0320 05:58:13.422397 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:58:13.422409 543705 net.go:698] Add success.
I0320 05:58:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:58:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:58:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 05:58:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:58:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 05:58:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:58:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:58:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:58:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:58:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:58:16.472413 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:58:22.877674 543705 disk_info.go:125] begin check local disk info of client
I0320 05:58:22.880175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:58:22.880183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa740 0xc0001aa780]
E0320 05:58:23.408880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:23.408894 543705 memory.go:184] no items to output this cycle
I0320 05:58:23.408924 543705 cpu.go:275] no items to output this cycle
E0320 05:58:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:33.409789 543705 memory.go:184] no items to output this cycle
I0320 05:58:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 05:58:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:43.409793 543705 memory.go:191] Add success.
I0320 05:58:43.409797 543705 cpu.go:282] Add success.
I0320 05:58:43.419995 543705 net.go:648] Add success.
I0320 05:58:43.423122 543705 net.go:770] primary dev: ETH0
I0320 05:58:43.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:58:43.423148 543705 net.go:698] Add success.
I0320 05:58:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:58:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:58:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:58:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:58:53.409769 543705 memory.go:184] no items to output this cycle
I0320 05:58:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 05:59:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:03.409787 543705 memory.go:184] no items to output this cycle
I0320 05:59:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 05:59:13.409864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:13.409907 543705 memory.go:191] Add success.
W0320 05:59:13.409944 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 05:59:13.409960 543705 cpu.go:282] Add success.
W0320 05:59:13.409968 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 05:59:13.409972 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 05:59:13.419711 543705 net.go:648] Add success.
I0320 05:59:13.422760 543705 net.go:770] primary dev: ETH0
I0320 05:59:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:59:13.422784 543705 net.go:698] Add success.
I0320 05:59:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 05:59:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 05:59:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 05:59:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 05:59:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 05:59:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 05:59:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 05:59:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:59:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:59:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 05:59:16.472449 543705 disk_local_worker.go:436] Get disk info: []
I0320 05:59:22.881675 543705 disk_info.go:125] begin check local disk info of client
I0320 05:59:22.884251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 05:59:22.884258 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c5600 0xc0004c5640]
E0320 05:59:23.408980 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:23.408997 543705 memory.go:184] no items to output this cycle
I0320 05:59:23.409012 543705 cpu.go:275] no items to output this cycle
E0320 05:59:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:33.409768 543705 memory.go:184] no items to output this cycle
I0320 05:59:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 05:59:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:43.409811 543705 memory.go:191] Add success.
I0320 05:59:43.409819 543705 cpu.go:282] Add success.
I0320 05:59:43.419960 543705 net.go:648] Add success.
I0320 05:59:43.423033 543705 net.go:770] primary dev: ETH0
I0320 05:59:43.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0320 05:59:43.423058 543705 net.go:698] Add success.
I0320 05:59:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 05:59:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 05:59:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 05:59:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 05:59:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 05:59:53.409778 543705 memory.go:184] no items to output this cycle
E0320 06:00:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:03.409790 543705 memory.go:184] no items to output this cycle
I0320 06:00:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 06:00:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:13.409795 543705 memory.go:191] Add success.
I0320 06:00:13.409797 543705 cpu.go:282] Add success.
W0320 06:00:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:00:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:00:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:00:13.419771 543705 net.go:648] Add success.
I0320 06:00:13.422762 543705 net.go:770] primary dev: ETH0
I0320 06:00:13.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:00:13.422796 543705 net.go:698] Add success.
I0320 06:00:13.468770 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1001c76c-e783-4d14-9432-118425b75764","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:00:13.468802 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:00:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:00:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:00:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 06:00:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:00:14.456614 543705 disk_worker.go:494] system disk:vda1
I0320 06:00:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:00:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:00:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:00:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:00:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:00:16.472420 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:00:22.885673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:00:22.888125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:00:22.888131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4c00 0xc0000c4c40]
E0320 06:00:23.407526 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:23.407538 543705 memory.go:184] no items to output this cycle
I0320 06:00:23.407546 543705 cpu.go:275] no items to output this cycle
E0320 06:00:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:33.409806 543705 memory.go:184] no items to output this cycle
I0320 06:00:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 06:00:38.169738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:00:38.169745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:00:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:43.410911 543705 memory.go:191] Add success.
I0320 06:00:43.409787 543705 cpu.go:282] Add success.
I0320 06:00:43.420658 543705 net.go:648] Add success.
I0320 06:00:43.423461 543705 net.go:770] primary dev: ETH0
I0320 06:00:43.423474 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:00:43.423488 543705 net.go:698] Add success.
I0320 06:00:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:00:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:00:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:00:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:00:53.409778 543705 memory.go:184] no items to output this cycle
I0320 06:00:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 06:01:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:03.409788 543705 memory.go:184] no items to output this cycle
I0320 06:01:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 06:01:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:13.409783 543705 memory.go:191] Add success.
I0320 06:01:13.409801 543705 cpu.go:282] Add success.
W0320 06:01:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:01:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:01:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:01:13.420550 543705 net.go:648] Add success.
I0320 06:01:13.423217 543705 net.go:770] primary dev: ETH0
I0320 06:01:13.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:01:13.423240 543705 net.go:698] Add success.
I0320 06:01:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:01:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:01:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 06:01:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:01:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 06:01:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:01:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:01:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:01:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:01:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:01:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:01:22.889676 543705 disk_info.go:125] begin check local disk info of client
I0320 06:01:22.892316 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:01:22.892324 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003904c0 0xc000390500]
E0320 06:01:23.408974 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:23.408984 543705 cpu.go:275] no items to output this cycle
I0320 06:01:23.408988 543705 memory.go:184] no items to output this cycle
E0320 06:01:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:33.409766 543705 memory.go:184] no items to output this cycle
I0320 06:01:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 06:01:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:43.409785 543705 memory.go:191] Add success.
I0320 06:01:43.409804 543705 cpu.go:282] Add success.
I0320 06:01:43.419881 543705 net.go:648] Add success.
I0320 06:01:43.422576 543705 net.go:770] primary dev: ETH0
I0320 06:01:43.422589 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:01:43.422601 543705 net.go:698] Add success.
I0320 06:01:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:01:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:01:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:01:53.410267 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:01:53.410286 543705 memory.go:184] no items to output this cycle
I0320 06:01:53.410289 543705 cpu.go:275] no items to output this cycle
E0320 06:02:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:03.409798 543705 memory.go:184] no items to output this cycle
I0320 06:02:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 06:02:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:13.409797 543705 cpu.go:282] Add success.
I0320 06:02:13.409808 543705 memory.go:191] Add success.
W0320 06:02:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:02:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:02:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:02:13.420139 543705 net.go:648] Add success.
I0320 06:02:13.422650 543705 net.go:770] primary dev: ETH0
I0320 06:02:13.422664 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:02:13.422675 543705 net.go:698] Add success.
W0320 06:02:14.455141 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:02:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 06:02:14.455208 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:02:14.455929 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:02:14.455938 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:02:14.455943 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:02:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 06:02:14.456579 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:02:15.456811 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:02:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:02:16.457909 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:02:16.457909 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:02:16.457966 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:02:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:02:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:02:22.893672 543705 disk_info.go:125] begin check local disk info of client
I0320 06:02:22.896070 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:02:22.896076 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e180 0xc00034e1c0]
E0320 06:02:23.407510 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:23.407524 543705 memory.go:184] no items to output this cycle
I0320 06:02:23.407556 543705 cpu.go:275] no items to output this cycle
E0320 06:02:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:33.409773 543705 memory.go:184] no items to output this cycle
I0320 06:02:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 06:02:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:43.409811 543705 memory.go:191] Add success.
I0320 06:02:43.409817 543705 cpu.go:282] Add success.
I0320 06:02:43.419870 543705 net.go:648] Add success.
I0320 06:02:43.422585 543705 net.go:770] primary dev: ETH0
I0320 06:02:43.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:02:43.422638 543705 net.go:698] Add success.
I0320 06:02:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:02:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:02:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:02:53.409793 543705 memory.go:184] no items to output this cycle
I0320 06:02:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 06:03:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:03.409780 543705 memory.go:184] no items to output this cycle
I0320 06:03:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 06:03:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:13.409818 543705 memory.go:191] Add success.
I0320 06:03:13.409828 543705 cpu.go:282] Add success.
W0320 06:03:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:03:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:03:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:03:13.420227 543705 net.go:648] Add success.
I0320 06:03:13.423266 543705 net.go:770] primary dev: ETH0
I0320 06:03:13.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:03:13.423293 543705 net.go:698] Add success.
I0320 06:03:13.647138 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c511bbff-04e6-4726-bb4d-94b2764e41c3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:03:13.647175 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:03:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:03:14.455224 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:03:14.455234 543705 disk_worker.go:708] disk space is not compliant
W0320 06:03:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:03:14.456722 543705 disk_worker.go:494] system disk:vda1
I0320 06:03:14.456750 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:03:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:03:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:03:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:03:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:03:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:03:22.897683 543705 disk_info.go:125] begin check local disk info of client
I0320 06:03:22.900236 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:03:22.900244 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384500 0xc000384540]
E0320 06:03:23.408870 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:23.408888 543705 memory.go:184] no items to output this cycle
I0320 06:03:23.408901 543705 cpu.go:275] no items to output this cycle
E0320 06:03:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:33.409770 543705 memory.go:184] no items to output this cycle
I0320 06:03:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 06:03:38.173563 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:03:38.173569 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:03:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:43.410709 543705 memory.go:191] Add success.
I0320 06:03:43.409787 543705 cpu.go:282] Add success.
I0320 06:03:43.420613 543705 net.go:648] Add success.
I0320 06:03:43.423250 543705 net.go:770] primary dev: ETH0
I0320 06:03:43.423264 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:03:43.423278 543705 net.go:698] Add success.
I0320 06:03:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:03:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:03:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:03:53.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:03:53.409761 543705 memory.go:184] no items to output this cycle
I0320 06:03:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 06:04:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:03.409797 543705 memory.go:184] no items to output this cycle
I0320 06:04:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 06:04:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:13.409780 543705 memory.go:191] Add success.
W0320 06:04:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:04:13.409812 543705 cpu.go:282] Add success.
W0320 06:04:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:04:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:04:13.420148 543705 net.go:648] Add success.
I0320 06:04:13.422905 543705 net.go:770] primary dev: ETH0
I0320 06:04:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:04:13.422929 543705 net.go:698] Add success.
I0320 06:04:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:04:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:04:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 06:04:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:04:14.456839 543705 disk_worker.go:494] system disk:vda1
I0320 06:04:14.456868 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:04:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:04:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:04:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:04:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:04:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:04:22.901686 543705 disk_info.go:125] begin check local disk info of client
I0320 06:04:22.904123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:04:22.904129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5240 0xc0000c5280]
E0320 06:04:23.407506 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:23.407520 543705 memory.go:184] no items to output this cycle
I0320 06:04:23.407553 543705 cpu.go:275] no items to output this cycle
E0320 06:04:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 06:04:33.409792 543705 memory.go:184] no items to output this cycle
E0320 06:04:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:43.409812 543705 memory.go:191] Add success.
I0320 06:04:43.409819 543705 cpu.go:282] Add success.
I0320 06:04:43.419884 543705 net.go:648] Add success.
I0320 06:04:43.422497 543705 net.go:770] primary dev: ETH0
I0320 06:04:43.422511 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:04:43.422523 543705 net.go:698] Add success.
I0320 06:04:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:04:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:04:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:04:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:04:53.409769 543705 memory.go:184] no items to output this cycle
I0320 06:04:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 06:05:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:03.409796 543705 memory.go:184] no items to output this cycle
I0320 06:05:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 06:05:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:13.409817 543705 memory.go:191] Add success.
I0320 06:05:13.409825 543705 cpu.go:282] Add success.
W0320 06:05:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:05:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:05:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:05:13.419953 543705 net.go:770] primary dev: ETH0
I0320 06:05:13.419965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:05:13.419978 543705 net.go:698] Add success.
I0320 06:05:13.420361 543705 net.go:648] Add success.
I0320 06:05:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:05:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:05:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 06:05:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:05:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 06:05:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:05:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:05:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:05:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:05:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:05:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:05:22.905692 543705 disk_info.go:125] begin check local disk info of client
I0320 06:05:22.908245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:05:22.908253 543705 disk_info.go:196] parse disk info done, disk is : [0xc000330040 0xc000330080]
E0320 06:05:23.408841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:23.408861 543705 memory.go:184] no items to output this cycle
I0320 06:05:23.408874 543705 cpu.go:275] no items to output this cycle
E0320 06:05:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:33.409777 543705 memory.go:184] no items to output this cycle
I0320 06:05:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 06:05:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:43.409788 543705 memory.go:191] Add success.
I0320 06:05:43.409788 543705 cpu.go:282] Add success.
I0320 06:05:43.420020 543705 net.go:648] Add success.
I0320 06:05:43.422756 543705 net.go:770] primary dev: ETH0
I0320 06:05:43.422770 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:05:43.422782 543705 net.go:698] Add success.
I0320 06:05:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:05:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:05:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:05:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:05:53.409780 543705 memory.go:184] no items to output this cycle
I0320 06:05:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 06:06:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:03.409787 543705 memory.go:184] no items to output this cycle
I0320 06:06:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 06:06:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:13.409787 543705 memory.go:191] Add success.
I0320 06:06:13.409806 543705 cpu.go:282] Add success.
W0320 06:06:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:06:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:06:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:06:13.420042 543705 net.go:648] Add success.
I0320 06:06:13.422901 543705 net.go:770] primary dev: ETH0
I0320 06:06:13.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:06:13.422927 543705 net.go:698] Add success.
I0320 06:06:13.468986 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ba8f4fcb-e0cd-4544-964f-22a093e8456d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:06:13.469025 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:06:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:06:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:06:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 06:06:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:06:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 06:06:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:06:15.456009 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:06:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:06:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:06:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:06:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:06:22.909671 543705 disk_info.go:125] begin check local disk info of client
I0320 06:06:22.912100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:06:22.912106 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492080 0xc0004920c0]
E0320 06:06:23.407527 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:23.407541 543705 memory.go:184] no items to output this cycle
I0320 06:06:23.407550 543705 cpu.go:275] no items to output this cycle
E0320 06:06:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 06:06:33.409780 543705 memory.go:184] no items to output this cycle
I0320 06:06:38.173735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:06:38.173741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:06:43.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:43.410673 543705 memory.go:191] Add success.
I0320 06:06:43.409806 543705 cpu.go:282] Add success.
I0320 06:06:43.420304 543705 net.go:648] Add success.
I0320 06:06:43.422897 543705 net.go:770] primary dev: ETH0
I0320 06:06:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:06:43.422926 543705 net.go:698] Add success.
I0320 06:06:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:06:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:06:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:06:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:06:53.409794 543705 memory.go:184] no items to output this cycle
I0320 06:06:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 06:07:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:03.409776 543705 memory.go:184] no items to output this cycle
I0320 06:07:03.409845 543705 cpu.go:275] no items to output this cycle
E0320 06:07:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:13.409810 543705 memory.go:191] Add success.
I0320 06:07:13.409829 543705 cpu.go:282] Add success.
W0320 06:07:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:07:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:07:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:07:13.420107 543705 net.go:648] Add success.
I0320 06:07:13.422881 543705 net.go:770] primary dev: ETH0
I0320 06:07:13.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:07:13.422907 543705 net.go:698] Add success.
I0320 06:07:13.453452 543705 event_worker.go:152] Polling the log file for events...
W0320 06:07:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:07:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 06:07:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:07:14.455916 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:07:14.455925 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:07:14.455931 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:07:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 06:07:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:07:15.456868 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:07:15.456877 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:07:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:07:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:07:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:07:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:07:16.472351 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:07:22.913683 543705 disk_info.go:125] begin check local disk info of client
I0320 06:07:22.916242 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:07:22.916249 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484000 0xc000484040]
E0320 06:07:23.408800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:23.408818 543705 memory.go:184] no items to output this cycle
I0320 06:07:23.408833 543705 cpu.go:275] no items to output this cycle
E0320 06:07:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:33.409774 543705 memory.go:184] no items to output this cycle
I0320 06:07:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 06:07:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:43.409784 543705 cpu.go:282] Add success.
I0320 06:07:43.409790 543705 memory.go:191] Add success.
I0320 06:07:43.420046 543705 net.go:648] Add success.
I0320 06:07:43.422900 543705 net.go:770] primary dev: ETH0
I0320 06:07:43.422929 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:07:43.422943 543705 net.go:698] Add success.
I0320 06:07:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:07:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:07:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:07:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:07:53.409769 543705 memory.go:184] no items to output this cycle
I0320 06:07:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 06:08:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:03.409783 543705 memory.go:184] no items to output this cycle
I0320 06:08:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 06:08:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:13.409811 543705 memory.go:191] Add success.
I0320 06:08:13.409821 543705 cpu.go:282] Add success.
W0320 06:08:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:08:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:08:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:08:13.420171 543705 net.go:648] Add success.
I0320 06:08:13.423304 543705 net.go:770] primary dev: ETH0
I0320 06:08:13.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:08:13.423330 543705 net.go:698] Add success.
I0320 06:08:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:08:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:08:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 06:08:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:08:14.456571 543705 disk_worker.go:494] system disk:vda1
I0320 06:08:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:08:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:08:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:08:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:08:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:08:16.472357 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:08:22.917668 543705 disk_info.go:125] begin check local disk info of client
I0320 06:08:22.920154 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:08:22.920162 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003846c0 0xc000384700]
E0320 06:08:23.408671 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:23.408680 543705 cpu.go:275] no items to output this cycle
I0320 06:08:23.408683 543705 memory.go:184] no items to output this cycle
E0320 06:08:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:33.409797 543705 memory.go:184] no items to output this cycle
I0320 06:08:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 06:08:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:43.409783 543705 memory.go:191] Add success.
I0320 06:08:43.409784 543705 cpu.go:282] Add success.
I0320 06:08:43.419894 543705 net.go:648] Add success.
I0320 06:08:43.422509 543705 net.go:770] primary dev: ETH0
I0320 06:08:43.422524 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:08:43.422539 543705 net.go:698] Add success.
I0320 06:08:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:08:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:08:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:08:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:08:53.409779 543705 memory.go:184] no items to output this cycle
I0320 06:08:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 06:09:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:03.409797 543705 memory.go:184] no items to output this cycle
I0320 06:09:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 06:09:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:13.409806 543705 memory.go:191] Add success.
I0320 06:09:13.409813 543705 cpu.go:282] Add success.
W0320 06:09:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:09:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:09:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:09:13.420090 543705 net.go:648] Add success.
I0320 06:09:13.422913 543705 net.go:770] primary dev: ETH0
I0320 06:09:13.422926 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:09:13.422937 543705 net.go:698] Add success.
I0320 06:09:13.468347 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0ed36070-040c-4b92-ad66-132967d63014","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:09:13.468380 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:09:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:09:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 06:09:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:09:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 06:09:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:09:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:09:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:09:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:09:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:09:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:09:22.921681 543705 disk_info.go:125] begin check local disk info of client
I0320 06:09:22.924314 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:09:22.924322 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033f140 0xc00033f180]
E0320 06:09:23.408791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:23.408808 543705 memory.go:184] no items to output this cycle
I0320 06:09:23.408841 543705 cpu.go:275] no items to output this cycle
E0320 06:09:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:33.409809 543705 memory.go:184] no items to output this cycle
I0320 06:09:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 06:09:38.173878 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:09:38.173884 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:09:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:43.410783 543705 memory.go:191] Add success.
I0320 06:09:43.409786 543705 cpu.go:282] Add success.
I0320 06:09:43.420314 543705 net.go:770] primary dev: ETH0
I0320 06:09:43.420327 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:09:43.420340 543705 net.go:698] Add success.
I0320 06:09:43.420713 543705 net.go:648] Add success.
I0320 06:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:09:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:09:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:09:53.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:09:53.409758 543705 memory.go:184] no items to output this cycle
I0320 06:09:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 06:10:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:03.409810 543705 memory.go:184] no items to output this cycle
I0320 06:10:03.409963 543705 cpu.go:275] no items to output this cycle
E0320 06:10:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:13.409780 543705 memory.go:191] Add success.
I0320 06:10:13.409800 543705 cpu.go:282] Add success.
W0320 06:10:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:10:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:10:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:10:13.420139 543705 net.go:648] Add success.
I0320 06:10:13.422699 543705 net.go:770] primary dev: ETH0
I0320 06:10:13.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:10:13.422723 543705 net.go:698] Add success.
I0320 06:10:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:10:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:10:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 06:10:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:10:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 06:10:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:10:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:10:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:10:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:10:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:10:22.925676 543705 disk_info.go:125] begin check local disk info of client
I0320 06:10:22.928128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:10:22.928133 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a0580 0xc0002a05c0]
E0320 06:10:23.408624 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:23.408640 543705 memory.go:184] no items to output this cycle
I0320 06:10:23.408656 543705 cpu.go:275] no items to output this cycle
E0320 06:10:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:33.409792 543705 memory.go:184] no items to output this cycle
I0320 06:10:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 06:10:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:43.409780 543705 memory.go:191] Add success.
I0320 06:10:43.409810 543705 cpu.go:282] Add success.
I0320 06:10:43.420002 543705 net.go:648] Add success.
I0320 06:10:43.422529 543705 net.go:770] primary dev: ETH0
I0320 06:10:43.422545 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:10:43.422559 543705 net.go:698] Add success.
I0320 06:10:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:10:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:10:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:10:53.409767 543705 memory.go:184] no items to output this cycle
I0320 06:10:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 06:11:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:03.409815 543705 memory.go:184] no items to output this cycle
I0320 06:11:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 06:11:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:13.409784 543705 memory.go:191] Add success.
I0320 06:11:13.409803 543705 cpu.go:282] Add success.
W0320 06:11:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:11:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:11:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:11:13.420198 543705 net.go:648] Add success.
I0320 06:11:13.423321 543705 net.go:770] primary dev: ETH0
I0320 06:11:13.423334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:11:13.423345 543705 net.go:698] Add success.
I0320 06:11:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:11:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:11:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 06:11:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:11:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 06:11:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:11:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:11:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:11:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:11:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:11:16.472410 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:11:22.929677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:11:22.932240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:11:22.932247 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e4ac0 0xc0004e4b00]
E0320 06:11:23.408710 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:23.408729 543705 memory.go:184] no items to output this cycle
I0320 06:11:23.408742 543705 cpu.go:275] no items to output this cycle
E0320 06:11:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:33.409782 543705 memory.go:184] no items to output this cycle
I0320 06:11:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 06:11:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:43.409782 543705 memory.go:191] Add success.
I0320 06:11:43.409800 543705 cpu.go:282] Add success.
I0320 06:11:43.419959 543705 net.go:648] Add success.
I0320 06:11:43.422558 543705 net.go:770] primary dev: ETH0
I0320 06:11:43.422571 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:11:43.422583 543705 net.go:698] Add success.
I0320 06:11:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:11:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:11:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:11:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:11:53.409762 543705 memory.go:184] no items to output this cycle
I0320 06:11:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 06:12:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 06:12:03.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:03.409829 543705 memory.go:184] no items to output this cycle
E0320 06:12:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:13.409795 543705 memory.go:191] Add success.
I0320 06:12:13.409796 543705 cpu.go:282] Add success.
W0320 06:12:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:12:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:12:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:12:13.420139 543705 net.go:648] Add success.
I0320 06:12:13.423039 543705 net.go:770] primary dev: ETH0
I0320 06:12:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:12:13.423064 543705 net.go:698] Add success.
I0320 06:12:13.551311 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a201e6aa-4253-421f-b224-70cab2828e11","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:12:13.551346 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 06:12:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:12:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 06:12:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:12:14.456834 543705 disk_worker.go:494] system disk:vda1
E0320 06:12:14.456851 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:12:14.456859 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:12:14.456865 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:12:14.456888 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:12:15.456795 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:12:15.456804 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:12:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:12:16.457969 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:12:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:12:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:12:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:12:22.933676 543705 disk_info.go:125] begin check local disk info of client
I0320 06:12:22.936074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:12:22.936080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7300 0xc0001c7340]
E0320 06:12:23.407530 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:23.407546 543705 memory.go:184] no items to output this cycle
I0320 06:12:23.407561 543705 cpu.go:275] no items to output this cycle
E0320 06:12:33.409838 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:33.409860 543705 memory.go:184] no items to output this cycle
I0320 06:12:33.409934 543705 cpu.go:275] no items to output this cycle
I0320 06:12:38.174025 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:12:38.174031 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:12:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:43.410688 543705 memory.go:191] Add success.
I0320 06:12:43.409810 543705 cpu.go:282] Add success.
I0320 06:12:43.420439 543705 net.go:648] Add success.
I0320 06:12:43.423185 543705 net.go:770] primary dev: ETH0
I0320 06:12:43.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:12:43.423214 543705 net.go:698] Add success.
I0320 06:12:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:12:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:12:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:12:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:12:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 06:12:53.409788 543705 memory.go:184] no items to output this cycle
E0320 06:13:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:03.409814 543705 memory.go:184] no items to output this cycle
I0320 06:13:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 06:13:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:13.409798 543705 memory.go:191] Add success.
I0320 06:13:13.409798 543705 cpu.go:282] Add success.
W0320 06:13:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:13:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:13:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:13:13.420120 543705 net.go:648] Add success.
I0320 06:13:13.422711 543705 net.go:770] primary dev: ETH0
I0320 06:13:13.422724 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:13:13.422737 543705 net.go:698] Add success.
I0320 06:13:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:13:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:13:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 06:13:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:13:14.456599 543705 disk_worker.go:494] system disk:vda1
I0320 06:13:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:13:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:13:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:13:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:13:22.937684 543705 disk_info.go:125] begin check local disk info of client
I0320 06:13:22.940217 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:13:22.940224 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025c440 0xc00025c480]
E0320 06:13:23.408655 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:23.408672 543705 memory.go:184] no items to output this cycle
I0320 06:13:23.408686 543705 cpu.go:275] no items to output this cycle
E0320 06:13:33.409902 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:33.409912 543705 cpu.go:275] no items to output this cycle
I0320 06:13:33.409920 543705 memory.go:184] no items to output this cycle
E0320 06:13:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:43.409797 543705 memory.go:191] Add success.
I0320 06:13:43.409801 543705 cpu.go:282] Add success.
I0320 06:13:43.420084 543705 net.go:648] Add success.
I0320 06:13:43.422933 543705 net.go:770] primary dev: ETH0
I0320 06:13:43.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:13:43.422957 543705 net.go:698] Add success.
I0320 06:13:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:13:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:13:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:13:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:13:53.409771 543705 memory.go:184] no items to output this cycle
I0320 06:13:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 06:14:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:03.409813 543705 memory.go:184] no items to output this cycle
I0320 06:14:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 06:14:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:13.409789 543705 memory.go:191] Add success.
I0320 06:14:13.409812 543705 cpu.go:282] Add success.
W0320 06:14:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:14:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:14:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:14:13.420144 543705 net.go:648] Add success.
I0320 06:14:13.422766 543705 net.go:770] primary dev: ETH0
I0320 06:14:13.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:14:13.422794 543705 net.go:698] Add success.
I0320 06:14:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:14:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:14:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 06:14:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:14:14.456497 543705 disk_worker.go:494] system disk:vda1
I0320 06:14:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:14:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:14:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:14:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:14:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:14:22.941673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:14:22.944097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:14:22.944103 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466340 0xc000466380]
E0320 06:14:23.408512 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:23.408529 543705 memory.go:184] no items to output this cycle
I0320 06:14:23.408542 543705 cpu.go:275] no items to output this cycle
E0320 06:14:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:33.409787 543705 memory.go:184] no items to output this cycle
I0320 06:14:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 06:14:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:43.409779 543705 memory.go:191] Add success.
I0320 06:14:43.409806 543705 cpu.go:282] Add success.
I0320 06:14:43.419902 543705 net.go:648] Add success.
I0320 06:14:43.422770 543705 net.go:770] primary dev: ETH0
I0320 06:14:43.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:14:43.422796 543705 net.go:698] Add success.
I0320 06:14:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:14:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:14:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:14:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:14:53.409793 543705 memory.go:184] no items to output this cycle
I0320 06:14:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 06:15:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:03.409809 543705 memory.go:184] no items to output this cycle
I0320 06:15:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 06:15:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:13.409799 543705 cpu.go:282] Add success.
I0320 06:15:13.409802 543705 memory.go:191] Add success.
W0320 06:15:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:15:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:15:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:15:13.420226 543705 net.go:648] Add success.
I0320 06:15:13.423206 543705 net.go:770] primary dev: ETH0
I0320 06:15:13.423221 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:15:13.423235 543705 net.go:698] Add success.
I0320 06:15:13.527931 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b6ac6934-bd64-4c97-afa9-737f656fa755","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:15:13.527965 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:15:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:15:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:15:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 06:15:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:15:14.456542 543705 disk_worker.go:494] system disk:vda1
I0320 06:15:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:15:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:15:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:15:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:15:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:15:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:15:22.945688 543705 disk_info.go:125] begin check local disk info of client
I0320 06:15:22.948270 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:15:22.948276 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028a140 0xc00028a180]
E0320 06:15:23.408659 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:23.408678 543705 memory.go:184] no items to output this cycle
I0320 06:15:23.408691 543705 cpu.go:275] no items to output this cycle
E0320 06:15:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:33.409761 543705 memory.go:184] no items to output this cycle
I0320 06:15:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 06:15:38.174171 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:15:38.174177 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:15:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:43.410571 543705 memory.go:191] Add success.
I0320 06:15:43.409795 543705 cpu.go:282] Add success.
I0320 06:15:43.420376 543705 net.go:648] Add success.
I0320 06:15:43.422976 543705 net.go:770] primary dev: ETH0
I0320 06:15:43.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:15:43.423000 543705 net.go:698] Add success.
I0320 06:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:15:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:15:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:15:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:15:53.409774 543705 memory.go:184] no items to output this cycle
I0320 06:15:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 06:16:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 06:16:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:03.409807 543705 memory.go:184] no items to output this cycle
E0320 06:16:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:13.409819 543705 memory.go:191] Add success.
I0320 06:16:13.409825 543705 cpu.go:282] Add success.
W0320 06:16:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:16:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:16:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:16:13.420129 543705 net.go:648] Add success.
I0320 06:16:13.423130 543705 net.go:770] primary dev: ETH0
I0320 06:16:13.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:16:13.423154 543705 net.go:698] Add success.
I0320 06:16:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:16:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:16:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 06:16:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:16:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 06:16:14.456531 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:16:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:16:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:16:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:16:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:16:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:16:22.949674 543705 disk_info.go:125] begin check local disk info of client
I0320 06:16:22.952106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:16:22.952112 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a440 0xc00039a480]
E0320 06:16:23.407860 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:23.407864 543705 cpu.go:275] no items to output this cycle
I0320 06:16:23.407872 543705 memory.go:184] no items to output this cycle
E0320 06:16:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:33.409804 543705 memory.go:184] no items to output this cycle
I0320 06:16:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 06:16:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:43.409779 543705 memory.go:191] Add success.
I0320 06:16:43.409797 543705 cpu.go:282] Add success.
I0320 06:16:43.419859 543705 net.go:648] Add success.
I0320 06:16:43.422652 543705 net.go:770] primary dev: ETH0
I0320 06:16:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:16:43.422678 543705 net.go:698] Add success.
I0320 06:16:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:16:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:16:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:16:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:16:53.409798 543705 memory.go:184] no items to output this cycle
I0320 06:16:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 06:17:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:03.409777 543705 memory.go:184] no items to output this cycle
I0320 06:17:03.409838 543705 cpu.go:275] no items to output this cycle
E0320 06:17:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:13.409817 543705 memory.go:191] Add success.
I0320 06:17:13.409824 543705 cpu.go:282] Add success.
W0320 06:17:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:17:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:17:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:17:13.420130 543705 net.go:648] Add success.
I0320 06:17:13.423216 543705 net.go:770] primary dev: ETH0
I0320 06:17:13.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:17:13.423246 543705 net.go:698] Add success.
I0320 06:17:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0320 06:17:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:17:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 06:17:14.455157 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:17:14.456922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:17:14.456931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:17:14.456937 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:17:14.456992 543705 disk_worker.go:494] system disk:vda1
I0320 06:17:14.457019 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:17:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:17:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:17:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:17:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:17:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:17:16.458024 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:17:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:17:22.953686 543705 disk_info.go:125] begin check local disk info of client
I0320 06:17:22.956206 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:17:22.956213 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2280 0xc0003b22c0]
E0320 06:17:23.408581 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:23.408600 543705 memory.go:184] no items to output this cycle
I0320 06:17:23.408614 543705 cpu.go:275] no items to output this cycle
E0320 06:17:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:33.409769 543705 memory.go:184] no items to output this cycle
I0320 06:17:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 06:17:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:43.409813 543705 memory.go:191] Add success.
I0320 06:17:43.409814 543705 cpu.go:282] Add success.
I0320 06:17:43.419953 543705 net.go:648] Add success.
I0320 06:17:43.422824 543705 net.go:770] primary dev: ETH0
I0320 06:17:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:17:43.422856 543705 net.go:698] Add success.
I0320 06:17:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:17:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:17:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:17:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:17:53.409773 543705 memory.go:184] no items to output this cycle
I0320 06:17:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 06:18:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:03.409779 543705 memory.go:184] no items to output this cycle
I0320 06:18:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 06:18:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:13.409803 543705 memory.go:191] Add success.
I0320 06:18:13.409803 543705 cpu.go:282] Add success.
W0320 06:18:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:18:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:18:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:18:13.419733 543705 net.go:648] Add success.
I0320 06:18:13.422236 543705 net.go:770] primary dev: ETH0
I0320 06:18:13.422249 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:18:13.422259 543705 net.go:698] Add success.
I0320 06:18:13.468734 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"de403176-931e-4443-a3ba-b56d510934ee","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:18:13.468764 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:18:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:18:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:18:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 06:18:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:18:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 06:18:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:18:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:18:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:18:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:18:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:18:16.472394 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:18:22.957675 543705 disk_info.go:125] begin check local disk info of client
I0320 06:18:22.960065 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:18:22.960072 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256800 0xc000256840]
E0320 06:18:23.408396 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:23.408412 543705 memory.go:184] no items to output this cycle
I0320 06:18:23.408425 543705 cpu.go:275] no items to output this cycle
E0320 06:18:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 06:18:33.409793 543705 memory.go:184] no items to output this cycle
I0320 06:18:38.177577 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:18:38.177584 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:18:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:43.410856 543705 memory.go:191] Add success.
I0320 06:18:43.409822 543705 cpu.go:282] Add success.
I0320 06:18:43.420615 543705 net.go:648] Add success.
I0320 06:18:43.423393 543705 net.go:770] primary dev: ETH0
I0320 06:18:43.423407 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:18:43.423419 543705 net.go:698] Add success.
I0320 06:18:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:18:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:18:46.458051 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:18:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:18:53.409786 543705 memory.go:184] no items to output this cycle
I0320 06:18:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 06:19:03.409837 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:03.409857 543705 memory.go:184] no items to output this cycle
I0320 06:19:03.409929 543705 cpu.go:275] no items to output this cycle
E0320 06:19:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:13.409808 543705 memory.go:191] Add success.
I0320 06:19:13.409809 543705 cpu.go:282] Add success.
W0320 06:19:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:19:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:19:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:19:13.420186 543705 net.go:648] Add success.
I0320 06:19:13.422917 543705 net.go:770] primary dev: ETH0
I0320 06:19:13.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:19:13.422945 543705 net.go:698] Add success.
I0320 06:19:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:19:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:19:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 06:19:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:19:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 06:19:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:19:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:19:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:19:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:19:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:19:16.472419 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:19:22.961683 543705 disk_info.go:125] begin check local disk info of client
I0320 06:19:22.964192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:19:22.964199 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475680 0xc0004756c0]
E0320 06:19:23.407536 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:23.407553 543705 memory.go:184] no items to output this cycle
I0320 06:19:23.407569 543705 cpu.go:275] no items to output this cycle
E0320 06:19:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:33.409804 543705 memory.go:184] no items to output this cycle
I0320 06:19:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 06:19:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:43.409796 543705 memory.go:191] Add success.
I0320 06:19:43.409805 543705 cpu.go:282] Add success.
I0320 06:19:43.420043 543705 net.go:648] Add success.
I0320 06:19:43.422641 543705 net.go:770] primary dev: ETH0
I0320 06:19:43.422658 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:19:43.422672 543705 net.go:698] Add success.
I0320 06:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:19:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:19:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:19:53.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:19:53.409906 543705 memory.go:184] no items to output this cycle
I0320 06:19:53.409907 543705 cpu.go:275] no items to output this cycle
E0320 06:20:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:03.409761 543705 memory.go:184] no items to output this cycle
I0320 06:20:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 06:20:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:13.409824 543705 memory.go:191] Add success.
I0320 06:20:13.409829 543705 cpu.go:282] Add success.
W0320 06:20:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:20:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:20:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:20:13.420206 543705 net.go:648] Add success.
I0320 06:20:13.422804 543705 net.go:770] primary dev: ETH0
I0320 06:20:13.422818 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:20:13.422832 543705 net.go:698] Add success.
I0320 06:20:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:20:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:20:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 06:20:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:20:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 06:20:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:20:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:20:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:20:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:20:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:20:16.472416 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:20:22.965671 543705 disk_info.go:125] begin check local disk info of client
I0320 06:20:22.968103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:20:22.968108 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4680 0xc0000c46c0]
E0320 06:20:23.408402 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:23.408418 543705 memory.go:184] no items to output this cycle
I0320 06:20:23.408433 543705 cpu.go:275] no items to output this cycle
E0320 06:20:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:33.409782 543705 memory.go:184] no items to output this cycle
I0320 06:20:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 06:20:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:43.409788 543705 memory.go:191] Add success.
I0320 06:20:43.409804 543705 cpu.go:282] Add success.
I0320 06:20:43.420003 543705 net.go:648] Add success.
I0320 06:20:43.423022 543705 net.go:770] primary dev: ETH0
I0320 06:20:43.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:20:43.423088 543705 net.go:698] Add success.
I0320 06:20:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:20:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:20:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:20:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:20:53.409783 543705 memory.go:184] no items to output this cycle
I0320 06:20:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 06:21:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:03.409775 543705 memory.go:184] no items to output this cycle
I0320 06:21:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 06:21:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:13.409803 543705 memory.go:191] Add success.
I0320 06:21:13.409802 543705 cpu.go:282] Add success.
W0320 06:21:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:21:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:21:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:21:13.420046 543705 net.go:648] Add success.
I0320 06:21:13.423095 543705 net.go:770] primary dev: ETH0
I0320 06:21:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:21:13.423120 543705 net.go:698] Add success.
I0320 06:21:13.464318 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2de38162-f71d-4977-ae10-c5369915936d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:21:13.464353 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:21:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:21:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:21:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 06:21:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:21:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 06:21:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:21:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:21:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:21:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:21:16.472370 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:21:22.969686 543705 disk_info.go:125] begin check local disk info of client
I0320 06:21:22.972248 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:21:22.972256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b5780 0xc0004b57c0]
E0320 06:21:23.408501 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:23.408515 543705 memory.go:184] no items to output this cycle
I0320 06:21:23.408550 543705 cpu.go:275] no items to output this cycle
E0320 06:21:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:33.409767 543705 memory.go:184] no items to output this cycle
I0320 06:21:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 06:21:38.177729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:21:38.177736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:21:43.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:43.410734 543705 memory.go:191] Add success.
I0320 06:21:43.409994 543705 cpu.go:282] Add success.
I0320 06:21:43.419723 543705 net.go:648] Add success.
I0320 06:21:43.422488 543705 net.go:770] primary dev: ETH0
I0320 06:21:43.422501 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:21:43.422513 543705 net.go:698] Add success.
I0320 06:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:21:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:21:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:21:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:21:53.409787 543705 memory.go:184] no items to output this cycle
I0320 06:21:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 06:22:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:03.409798 543705 memory.go:184] no items to output this cycle
I0320 06:22:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 06:22:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:13.409789 543705 memory.go:191] Add success.
I0320 06:22:13.409815 543705 cpu.go:282] Add success.
W0320 06:22:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:22:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:22:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:22:13.420103 543705 net.go:648] Add success.
I0320 06:22:13.422905 543705 net.go:770] primary dev: ETH0
I0320 06:22:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:22:13.422931 543705 net.go:698] Add success.
W0320 06:22:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:22:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 06:22:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:22:14.456866 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:22:14.456875 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:22:14.456881 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:22:14.456952 543705 disk_worker.go:494] system disk:vda1
I0320 06:22:14.456992 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:22:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:22:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:22:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:22:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:22:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:22:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:22:16.472362 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:22:22.973675 543705 disk_info.go:125] begin check local disk info of client
I0320 06:22:22.976068 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:22:22.976075 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b4400 0xc0004b4440]
E0320 06:22:23.408318 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:23.408334 543705 memory.go:184] no items to output this cycle
I0320 06:22:23.408351 543705 cpu.go:275] no items to output this cycle
E0320 06:22:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:33.409773 543705 memory.go:184] no items to output this cycle
I0320 06:22:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 06:22:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:43.409797 543705 memory.go:191] Add success.
I0320 06:22:43.409800 543705 cpu.go:282] Add success.
I0320 06:22:43.419970 543705 net.go:648] Add success.
I0320 06:22:43.422941 543705 net.go:770] primary dev: ETH0
I0320 06:22:43.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:22:43.422969 543705 net.go:698] Add success.
I0320 06:22:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:22:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:22:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:22:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:22:53.409806 543705 memory.go:184] no items to output this cycle
I0320 06:22:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 06:23:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:03.409783 543705 memory.go:184] no items to output this cycle
I0320 06:23:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 06:23:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:13.409801 543705 cpu.go:282] Add success.
I0320 06:23:13.409814 543705 memory.go:191] Add success.
W0320 06:23:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:23:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:23:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:23:13.420102 543705 net.go:648] Add success.
I0320 06:23:13.423085 543705 net.go:770] primary dev: ETH0
I0320 06:23:13.423101 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:23:13.423115 543705 net.go:698] Add success.
I0320 06:23:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:23:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:23:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 06:23:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:23:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 06:23:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:23:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:23:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:23:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:23:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:23:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:23:22.977685 543705 disk_info.go:125] begin check local disk info of client
I0320 06:23:22.980245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:23:22.980253 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003147c0 0xc000314800]
E0320 06:23:23.407539 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:23.407557 543705 memory.go:184] no items to output this cycle
I0320 06:23:23.407568 543705 cpu.go:275] no items to output this cycle
E0320 06:23:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:33.409785 543705 memory.go:184] no items to output this cycle
I0320 06:23:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 06:23:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:43.409786 543705 memory.go:191] Add success.
I0320 06:23:43.409805 543705 cpu.go:282] Add success.
I0320 06:23:43.419966 543705 net.go:648] Add success.
I0320 06:23:43.422768 543705 net.go:770] primary dev: ETH0
I0320 06:23:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:23:43.422794 543705 net.go:698] Add success.
I0320 06:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:23:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:23:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:23:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:23:53.409781 543705 memory.go:184] no items to output this cycle
I0320 06:23:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 06:24:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:03.409778 543705 memory.go:184] no items to output this cycle
I0320 06:24:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 06:24:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:13.409817 543705 memory.go:191] Add success.
I0320 06:24:13.409824 543705 cpu.go:282] Add success.
W0320 06:24:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:24:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:24:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:24:13.420147 543705 net.go:648] Add success.
I0320 06:24:13.422979 543705 net.go:770] primary dev: ETH0
I0320 06:24:13.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:24:13.423008 543705 net.go:698] Add success.
I0320 06:24:13.469711 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a5583943-f79e-4f95-9ea7-5ba6f1a8edd2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:24:13.469745 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:24:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:24:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:24:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 06:24:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:24:14.456641 543705 disk_worker.go:494] system disk:vda1
I0320 06:24:14.456672 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:24:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:24:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:24:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:24:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:24:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:24:22.981672 543705 disk_info.go:125] begin check local disk info of client
I0320 06:24:22.984112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:24:22.984118 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e940 0xc00037e980]
E0320 06:24:23.407507 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:23.407522 543705 memory.go:184] no items to output this cycle
I0320 06:24:23.407549 543705 cpu.go:275] no items to output this cycle
E0320 06:24:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:33.409793 543705 memory.go:184] no items to output this cycle
I0320 06:24:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 06:24:38.181589 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:24:38.181596 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:24:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:43.410609 543705 memory.go:191] Add success.
I0320 06:24:43.409827 543705 cpu.go:282] Add success.
I0320 06:24:43.420319 543705 net.go:648] Add success.
I0320 06:24:43.423375 543705 net.go:770] primary dev: ETH0
I0320 06:24:43.423388 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:24:43.423400 543705 net.go:698] Add success.
I0320 06:24:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:24:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:24:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:24:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:24:53.409788 543705 cpu.go:275] no items to output this cycle
I0320 06:24:53.409790 543705 memory.go:184] no items to output this cycle
E0320 06:25:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:03.409777 543705 memory.go:184] no items to output this cycle
I0320 06:25:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 06:25:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:13.409783 543705 memory.go:191] Add success.
W0320 06:25:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:25:13.409812 543705 cpu.go:282] Add success.
W0320 06:25:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:25:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:25:13.420057 543705 net.go:648] Add success.
I0320 06:25:13.422591 543705 net.go:770] primary dev: ETH0
I0320 06:25:13.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:25:13.422616 543705 net.go:698] Add success.
I0320 06:25:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:25:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:25:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 06:25:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:25:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 06:25:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:25:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:25:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:25:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:25:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:25:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:25:22.985683 543705 disk_info.go:125] begin check local disk info of client
I0320 06:25:22.988055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:25:22.988064 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352400 0xc000352440]
E0320 06:25:23.407551 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:23.407567 543705 memory.go:184] no items to output this cycle
I0320 06:25:23.407568 543705 cpu.go:275] no items to output this cycle
E0320 06:25:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:33.409780 543705 memory.go:184] no items to output this cycle
I0320 06:25:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 06:25:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:43.409814 543705 memory.go:191] Add success.
I0320 06:25:43.409828 543705 cpu.go:282] Add success.
I0320 06:25:43.420035 543705 net.go:648] Add success.
I0320 06:25:43.423173 543705 net.go:770] primary dev: ETH0
I0320 06:25:43.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:25:43.423203 543705 net.go:698] Add success.
I0320 06:25:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:25:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:25:46.458093 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:25:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:25:53.409802 543705 memory.go:184] no items to output this cycle
I0320 06:25:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 06:26:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:03.409808 543705 memory.go:184] no items to output this cycle
I0320 06:26:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 06:26:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:13.409791 543705 memory.go:191] Add success.
I0320 06:26:13.409811 543705 cpu.go:282] Add success.
W0320 06:26:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:26:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:26:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:26:13.420147 543705 net.go:648] Add success.
I0320 06:26:13.422954 543705 net.go:770] primary dev: ETH0
I0320 06:26:13.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:26:13.422984 543705 net.go:698] Add success.
I0320 06:26:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:26:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:26:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 06:26:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:26:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 06:26:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:26:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:26:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:26:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:26:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:26:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:26:22.989671 543705 disk_info.go:125] begin check local disk info of client
I0320 06:26:22.992083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:26:22.992089 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024ee40 0xc00024ee80]
E0320 06:26:23.408339 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:23.408358 543705 memory.go:184] no items to output this cycle
I0320 06:26:23.408458 543705 cpu.go:275] no items to output this cycle
E0320 06:26:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 06:26:33.409795 543705 memory.go:184] no items to output this cycle
E0320 06:26:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:43.409811 543705 memory.go:191] Add success.
I0320 06:26:43.409820 543705 cpu.go:282] Add success.
I0320 06:26:43.419960 543705 net.go:648] Add success.
I0320 06:26:43.424034 543705 net.go:770] primary dev: ETH0
I0320 06:26:43.424048 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:26:43.424064 543705 net.go:698] Add success.
I0320 06:26:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:26:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:26:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:26:53.410263 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:26:53.410281 543705 memory.go:184] no items to output this cycle
I0320 06:26:53.410294 543705 cpu.go:275] no items to output this cycle
E0320 06:27:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:03.409769 543705 memory.go:184] no items to output this cycle
I0320 06:27:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:27:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:13.409784 543705 memory.go:191] Add success.
I0320 06:27:13.409805 543705 cpu.go:282] Add success.
W0320 06:27:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:27:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:27:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:27:13.420124 543705 net.go:648] Add success.
I0320 06:27:13.422781 543705 net.go:770] primary dev: ETH0
I0320 06:27:13.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:27:13.422804 543705 net.go:698] Add success.
I0320 06:27:13.428768 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 06:27:13.452935 543705 event_worker.go:152] Polling the log file for events...
I0320 06:27:13.518550 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9b4a7ef8-d135-4bb9-919f-45d03ff8a24f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:27:13.518592 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 06:27:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:27:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 06:27:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:27:14.456815 543705 disk_worker.go:494] system disk:vda1
E0320 06:27:14.456833 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:27:14.456840 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:27:14.456845 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:27:14.456860 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:27:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:27:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:27:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:27:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:27:16.457984 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:27:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:27:16.472346 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:27:22.993679 543705 disk_info.go:125] begin check local disk info of client
I0320 06:27:22.996197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:27:22.996205 543705 disk_info.go:196] parse disk info done, disk is : [0xc000368480 0xc0003684c0]
E0320 06:27:23.408381 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:23.408395 543705 cpu.go:275] no items to output this cycle
I0320 06:27:23.408399 543705 memory.go:184] no items to output this cycle
E0320 06:27:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:33.409768 543705 memory.go:184] no items to output this cycle
I0320 06:27:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 06:27:38.181739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:27:38.181746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:27:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:43.410744 543705 memory.go:191] Add success.
I0320 06:27:43.409830 543705 cpu.go:282] Add success.
I0320 06:27:43.420505 543705 net.go:648] Add success.
I0320 06:27:43.423156 543705 net.go:770] primary dev: ETH0
I0320 06:27:43.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:27:43.423192 543705 net.go:698] Add success.
I0320 06:27:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:27:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:27:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:27:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:27:53.409778 543705 memory.go:184] no items to output this cycle
I0320 06:27:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 06:28:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:03.409773 543705 memory.go:184] no items to output this cycle
I0320 06:28:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 06:28:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:13.409826 543705 memory.go:191] Add success.
I0320 06:28:13.409831 543705 cpu.go:282] Add success.
W0320 06:28:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:28:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:28:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:28:13.420147 543705 net.go:648] Add success.
I0320 06:28:13.422879 543705 net.go:770] primary dev: ETH0
I0320 06:28:13.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:28:13.422905 543705 net.go:698] Add success.
I0320 06:28:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:28:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:28:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 06:28:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:28:14.456840 543705 disk_worker.go:494] system disk:vda1
I0320 06:28:14.456870 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:28:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:28:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:28:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:28:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:28:16.472432 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:28:22.997673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:28:23.000118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:28:23.000125 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047c280 0xc00047c2c0]
E0320 06:28:23.408260 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:23.408277 543705 memory.go:184] no items to output this cycle
I0320 06:28:23.408285 543705 cpu.go:275] no items to output this cycle
E0320 06:28:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:33.409804 543705 memory.go:184] no items to output this cycle
I0320 06:28:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 06:28:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:43.409799 543705 memory.go:191] Add success.
I0320 06:28:43.409799 543705 cpu.go:282] Add success.
I0320 06:28:43.419949 543705 net.go:648] Add success.
I0320 06:28:43.422919 543705 net.go:770] primary dev: ETH0
I0320 06:28:43.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:28:43.422945 543705 net.go:698] Add success.
I0320 06:28:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:28:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:28:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:28:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:28:53.409795 543705 memory.go:184] no items to output this cycle
I0320 06:28:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 06:29:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:03.409776 543705 memory.go:184] no items to output this cycle
I0320 06:29:03.409777 543705 cpu.go:275] no items to output this cycle
W0320 06:29:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:29:13.409727 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:29:13.409733 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:29:13.409799 543705 cpu.go:282] Add success.
E0320 06:29:13.409833 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:13.409856 543705 memory.go:191] Add success.
I0320 06:29:13.420201 543705 net.go:648] Add success.
I0320 06:29:13.422977 543705 net.go:770] primary dev: ETH0
I0320 06:29:13.422991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:29:13.423003 543705 net.go:698] Add success.
I0320 06:29:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:29:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:29:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 06:29:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:29:14.456526 543705 disk_worker.go:494] system disk:vda1
I0320 06:29:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:29:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:29:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:29:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:29:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:29:16.472415 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:29:23.001679 543705 disk_info.go:125] begin check local disk info of client
I0320 06:29:23.003989 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:29:23.003996 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047a280 0xc00047a2c0]
E0320 06:29:23.407558 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:23.407580 543705 memory.go:184] no items to output this cycle
I0320 06:29:23.407587 543705 cpu.go:275] no items to output this cycle
E0320 06:29:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:33.409802 543705 memory.go:184] no items to output this cycle
I0320 06:29:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 06:29:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:43.409783 543705 memory.go:191] Add success.
I0320 06:29:43.409806 543705 cpu.go:282] Add success.
I0320 06:29:43.419886 543705 net.go:648] Add success.
I0320 06:29:43.422660 543705 net.go:770] primary dev: ETH0
I0320 06:29:43.422672 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:29:43.422685 543705 net.go:698] Add success.
I0320 06:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:29:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:29:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:29:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:29:53.409763 543705 memory.go:184] no items to output this cycle
I0320 06:29:53.409775 543705 cpu.go:275] no items to output this cycle
E0320 06:30:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:03.409768 543705 memory.go:184] no items to output this cycle
I0320 06:30:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 06:30:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:13.409815 543705 memory.go:191] Add success.
I0320 06:30:13.409823 543705 cpu.go:282] Add success.
W0320 06:30:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:30:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:30:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:30:13.420106 543705 net.go:648] Add success.
I0320 06:30:13.422932 543705 net.go:770] primary dev: ETH0
I0320 06:30:13.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:30:13.422957 543705 net.go:698] Add success.
I0320 06:30:13.469072 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c79fda38-68f5-4d3c-8145-fe3c2199555a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:30:13.469106 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:30:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:30:14.455355 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:30:14.455500 543705 disk_worker.go:708] disk space is not compliant
W0320 06:30:14.455506 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:30:14.457095 543705 disk_worker.go:494] system disk:vda1
I0320 06:30:14.457124 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:30:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:30:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:30:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:30:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:30:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:30:23.005674 543705 disk_info.go:125] begin check local disk info of client
I0320 06:30:23.008187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:30:23.008194 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a180 0xc00039a1c0]
E0320 06:30:23.408239 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:23.408252 543705 memory.go:184] no items to output this cycle
I0320 06:30:23.408288 543705 cpu.go:275] no items to output this cycle
E0320 06:30:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:33.409784 543705 memory.go:184] no items to output this cycle
I0320 06:30:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 06:30:38.185611 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:30:38.185618 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:30:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:43.410793 543705 memory.go:191] Add success.
I0320 06:30:43.409789 543705 cpu.go:282] Add success.
I0320 06:30:43.420504 543705 net.go:648] Add success.
I0320 06:30:43.423106 543705 net.go:770] primary dev: ETH0
I0320 06:30:43.423119 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:30:43.423132 543705 net.go:698] Add success.
I0320 06:30:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:30:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:30:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:30:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:30:53.409802 543705 memory.go:184] no items to output this cycle
I0320 06:30:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 06:31:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:03.409773 543705 memory.go:184] no items to output this cycle
I0320 06:31:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 06:31:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:13.409819 543705 memory.go:191] Add success.
I0320 06:31:13.409824 543705 cpu.go:282] Add success.
W0320 06:31:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:31:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:31:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:31:13.420119 543705 net.go:648] Add success.
I0320 06:31:13.423270 543705 net.go:770] primary dev: ETH0
I0320 06:31:13.423283 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:31:13.423297 543705 net.go:698] Add success.
I0320 06:31:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:31:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:31:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 06:31:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:31:14.457156 543705 disk_worker.go:494] system disk:vda1
I0320 06:31:14.457186 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:31:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:31:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:31:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:31:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:31:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:31:23.009677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:31:23.012183 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:31:23.012191 543705 disk_info.go:196] parse disk info done, disk is : [0xc000261480 0xc0002614c0]
E0320 06:31:23.407570 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:23.407589 543705 memory.go:184] no items to output this cycle
I0320 06:31:23.407601 543705 cpu.go:275] no items to output this cycle
E0320 06:31:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:33.409800 543705 memory.go:184] no items to output this cycle
I0320 06:31:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 06:31:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:43.409792 543705 cpu.go:282] Add success.
I0320 06:31:43.409799 543705 memory.go:191] Add success.
I0320 06:31:43.420065 543705 net.go:648] Add success.
I0320 06:31:43.423149 543705 net.go:770] primary dev: ETH0
I0320 06:31:43.423164 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:31:43.423177 543705 net.go:698] Add success.
I0320 06:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:31:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:31:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:31:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:31:53.409792 543705 memory.go:184] no items to output this cycle
I0320 06:31:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 06:32:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:03.409778 543705 cpu.go:275] no items to output this cycle
I0320 06:32:03.409786 543705 memory.go:184] no items to output this cycle
E0320 06:32:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:13.409823 543705 memory.go:191] Add success.
I0320 06:32:13.409834 543705 cpu.go:282] Add success.
W0320 06:32:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:32:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:32:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:32:13.420200 543705 net.go:648] Add success.
I0320 06:32:13.422981 543705 net.go:770] primary dev: ETH0
I0320 06:32:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:32:13.423005 543705 net.go:698] Add success.
W0320 06:32:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:32:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 06:32:14.455199 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:32:14.455904 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:32:14.455913 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:32:14.455919 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:32:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 06:32:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:32:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:32:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:32:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:32:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:32:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:32:16.457994 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:32:16.472315 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:32:23.013676 543705 disk_info.go:125] begin check local disk info of client
I0320 06:32:23.016097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:32:23.016103 543705 disk_info.go:196] parse disk info done, disk is : [0xc000498300 0xc000498340]
E0320 06:32:23.407505 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:23.407526 543705 memory.go:184] no items to output this cycle
I0320 06:32:23.407566 543705 cpu.go:275] no items to output this cycle
E0320 06:32:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:33.409774 543705 memory.go:184] no items to output this cycle
I0320 06:32:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 06:32:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:43.409818 543705 memory.go:191] Add success.
I0320 06:32:43.409824 543705 cpu.go:282] Add success.
I0320 06:32:43.420010 543705 net.go:648] Add success.
I0320 06:32:43.422812 543705 net.go:770] primary dev: ETH0
I0320 06:32:43.422825 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:32:43.422837 543705 net.go:698] Add success.
I0320 06:32:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:32:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:32:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:32:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:32:53.409794 543705 memory.go:184] no items to output this cycle
I0320 06:32:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 06:33:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:03.409764 543705 memory.go:184] no items to output this cycle
I0320 06:33:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 06:33:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:13.409797 543705 memory.go:191] Add success.
I0320 06:33:13.409797 543705 cpu.go:282] Add success.
W0320 06:33:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:33:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:33:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:33:13.420151 543705 net.go:648] Add success.
I0320 06:33:13.422741 543705 net.go:770] primary dev: ETH0
I0320 06:33:13.422757 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:33:13.422771 543705 net.go:698] Add success.
I0320 06:33:13.601737 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c7d8a1f6-539d-4422-9e1e-31e0ff716165","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:33:13.601775 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:33:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:33:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:33:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 06:33:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:33:14.456732 543705 disk_worker.go:494] system disk:vda1
I0320 06:33:14.456764 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:33:15.455617 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:33:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:33:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:33:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:33:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:33:23.017673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:33:23.020042 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:33:23.020048 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b82c0 0xc0002b8300]
I0320 06:33:23.408101 543705 cpu.go:275] no items to output this cycle
E0320 06:33:23.408118 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:23.408137 543705 memory.go:184] no items to output this cycle
E0320 06:33:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:33.409784 543705 memory.go:184] no items to output this cycle
I0320 06:33:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 06:33:38.185749 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:33:38.185756 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:33:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:43.410777 543705 memory.go:191] Add success.
I0320 06:33:43.409809 543705 cpu.go:282] Add success.
I0320 06:33:43.420486 543705 net.go:648] Add success.
I0320 06:33:43.423015 543705 net.go:770] primary dev: ETH0
I0320 06:33:43.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:33:43.423043 543705 net.go:698] Add success.
I0320 06:33:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:33:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:33:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:33:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:33:53.409777 543705 memory.go:184] no items to output this cycle
I0320 06:33:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 06:34:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 06:34:03.409788 543705 memory.go:184] no items to output this cycle
E0320 06:34:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:13.409826 543705 memory.go:191] Add success.
I0320 06:34:13.409827 543705 cpu.go:282] Add success.
W0320 06:34:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:34:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:34:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:34:13.420233 543705 net.go:648] Add success.
I0320 06:34:13.423061 543705 net.go:770] primary dev: ETH0
I0320 06:34:13.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:34:13.423089 543705 net.go:698] Add success.
I0320 06:34:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:34:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:34:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 06:34:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:34:14.456569 543705 disk_worker.go:494] system disk:vda1
I0320 06:34:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:34:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:34:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:34:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:34:16.472359 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:34:23.021680 543705 disk_info.go:125] begin check local disk info of client
I0320 06:34:23.024135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:34:23.024142 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6300 0xc0001c6340]
E0320 06:34:23.408111 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:23.408124 543705 memory.go:184] no items to output this cycle
I0320 06:34:23.408160 543705 cpu.go:275] no items to output this cycle
E0320 06:34:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:33.409775 543705 memory.go:184] no items to output this cycle
I0320 06:34:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 06:34:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:43.409824 543705 memory.go:191] Add success.
I0320 06:34:43.409835 543705 cpu.go:282] Add success.
I0320 06:34:43.419912 543705 net.go:648] Add success.
I0320 06:34:43.422586 543705 net.go:770] primary dev: ETH0
I0320 06:34:43.422600 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:34:43.422612 543705 net.go:698] Add success.
I0320 06:34:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:34:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:34:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:34:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:34:53.409810 543705 memory.go:184] no items to output this cycle
I0320 06:34:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 06:35:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:03.409785 543705 memory.go:184] no items to output this cycle
I0320 06:35:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 06:35:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:13.409827 543705 memory.go:191] Add success.
I0320 06:35:13.409837 543705 cpu.go:282] Add success.
W0320 06:35:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:35:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:35:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:35:13.420052 543705 net.go:648] Add success.
I0320 06:35:13.423028 543705 net.go:770] primary dev: ETH0
I0320 06:35:13.423041 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:35:13.423054 543705 net.go:698] Add success.
I0320 06:35:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:35:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:35:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 06:35:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:35:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 06:35:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:35:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:35:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:35:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:35:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:35:16.472409 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:35:23.025666 543705 disk_info.go:125] begin check local disk info of client
I0320 06:35:23.028122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:35:23.028128 543705 disk_info.go:196] parse disk info done, disk is : [0xc000325f00 0xc000325f40]
E0320 06:35:23.407525 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:23.407541 543705 memory.go:184] no items to output this cycle
I0320 06:35:23.407748 543705 cpu.go:275] no items to output this cycle
E0320 06:35:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:33.409795 543705 memory.go:184] no items to output this cycle
I0320 06:35:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 06:35:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:43.409805 543705 memory.go:191] Add success.
I0320 06:35:43.409812 543705 cpu.go:282] Add success.
I0320 06:35:43.419906 543705 net.go:648] Add success.
I0320 06:35:43.422485 543705 net.go:770] primary dev: ETH0
I0320 06:35:43.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:35:43.422510 543705 net.go:698] Add success.
I0320 06:35:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:35:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:35:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:35:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:35:53.409812 543705 memory.go:184] no items to output this cycle
I0320 06:35:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 06:36:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:03.409787 543705 memory.go:184] no items to output this cycle
I0320 06:36:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 06:36:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:13.409828 543705 memory.go:191] Add success.
I0320 06:36:13.409829 543705 cpu.go:282] Add success.
W0320 06:36:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:36:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:36:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:36:13.420147 543705 net.go:648] Add success.
I0320 06:36:13.422996 543705 net.go:770] primary dev: ETH0
I0320 06:36:13.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:36:13.423022 543705 net.go:698] Add success.
I0320 06:36:13.747558 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"18269ea2-b662-497d-b949-c5c11a20098f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:36:13.747592 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:36:14.453970 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:36:14.455258 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:36:14.455269 543705 disk_worker.go:708] disk space is not compliant
W0320 06:36:14.455272 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:36:14.456850 543705 disk_worker.go:494] system disk:vda1
I0320 06:36:14.456883 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:36:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:36:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:36:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:36:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:36:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:36:23.029687 543705 disk_info.go:125] begin check local disk info of client
I0320 06:36:23.032203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:36:23.032209 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492140 0xc000492180]
E0320 06:36:23.408152 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:23.408164 543705 memory.go:184] no items to output this cycle
I0320 06:36:23.408171 543705 cpu.go:275] no items to output this cycle
E0320 06:36:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:33.409826 543705 memory.go:184] no items to output this cycle
I0320 06:36:33.409839 543705 cpu.go:275] no items to output this cycle
I0320 06:36:38.189625 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:36:38.189632 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:36:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:43.410758 543705 memory.go:191] Add success.
I0320 06:36:43.409807 543705 cpu.go:282] Add success.
I0320 06:36:43.420485 543705 net.go:648] Add success.
I0320 06:36:43.423659 543705 net.go:770] primary dev: ETH0
I0320 06:36:43.423672 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:36:43.423685 543705 net.go:698] Add success.
I0320 06:36:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:36:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:36:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:36:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:36:53.409769 543705 memory.go:184] no items to output this cycle
I0320 06:36:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 06:37:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:03.409780 543705 memory.go:184] no items to output this cycle
I0320 06:37:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 06:37:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:13.409786 543705 memory.go:191] Add success.
W0320 06:37:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:37:13.409818 543705 cpu.go:282] Add success.
W0320 06:37:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:37:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:37:13.420504 543705 net.go:648] Add success.
I0320 06:37:13.423179 543705 net.go:770] primary dev: ETH0
I0320 06:37:13.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:37:13.423207 543705 net.go:698] Add success.
I0320 06:37:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0320 06:37:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:37:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 06:37:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:37:14.455882 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:37:14.455890 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:37:14.455896 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:37:14.456549 543705 disk_worker.go:494] system disk:vda1
I0320 06:37:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:37:15.456799 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:37:15.456807 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:37:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:37:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:37:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:37:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:37:16.472343 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:37:23.033675 543705 disk_info.go:125] begin check local disk info of client
I0320 06:37:23.036089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:37:23.036095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000259180 0xc0002591c0]
E0320 06:37:23.407506 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:23.407519 543705 memory.go:184] no items to output this cycle
I0320 06:37:23.407601 543705 cpu.go:275] no items to output this cycle
E0320 06:37:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:33.409792 543705 memory.go:184] no items to output this cycle
I0320 06:37:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 06:37:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:43.409795 543705 memory.go:191] Add success.
I0320 06:37:43.409799 543705 cpu.go:282] Add success.
I0320 06:37:43.419982 543705 net.go:648] Add success.
I0320 06:37:43.422684 543705 net.go:770] primary dev: ETH0
I0320 06:37:43.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:37:43.422710 543705 net.go:698] Add success.
I0320 06:37:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:37:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:37:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:37:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:37:53.409795 543705 memory.go:184] no items to output this cycle
I0320 06:37:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 06:38:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:03.409784 543705 memory.go:184] no items to output this cycle
I0320 06:38:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 06:38:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:13.409789 543705 memory.go:191] Add success.
I0320 06:38:13.409812 543705 cpu.go:282] Add success.
W0320 06:38:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:38:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:38:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:38:13.420098 543705 net.go:648] Add success.
I0320 06:38:13.422687 543705 net.go:770] primary dev: ETH0
I0320 06:38:13.422700 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:38:13.422713 543705 net.go:698] Add success.
I0320 06:38:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:38:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:38:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 06:38:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:38:14.456491 543705 disk_worker.go:494] system disk:vda1
I0320 06:38:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:38:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:38:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:38:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:38:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:38:23.037678 543705 disk_info.go:125] begin check local disk info of client
I0320 06:38:23.040112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:38:23.040118 543705 disk_info.go:196] parse disk info done, disk is : [0xc000393740 0xc000393780]
E0320 06:38:23.407506 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:23.407519 543705 memory.go:184] no items to output this cycle
I0320 06:38:23.407547 543705 cpu.go:275] no items to output this cycle
E0320 06:38:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:33.409908 543705 memory.go:184] no items to output this cycle
I0320 06:38:33.409938 543705 cpu.go:275] no items to output this cycle
E0320 06:38:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:43.409793 543705 memory.go:191] Add success.
I0320 06:38:43.409797 543705 cpu.go:282] Add success.
I0320 06:38:43.419960 543705 net.go:648] Add success.
I0320 06:38:43.422867 543705 net.go:770] primary dev: ETH0
I0320 06:38:43.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:38:43.422891 543705 net.go:698] Add success.
I0320 06:38:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:38:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:38:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:38:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:38:53.409797 543705 memory.go:184] no items to output this cycle
I0320 06:38:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 06:39:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:03.409800 543705 memory.go:184] no items to output this cycle
I0320 06:39:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 06:39:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:13.409790 543705 memory.go:191] Add success.
I0320 06:39:13.409817 543705 cpu.go:282] Add success.
W0320 06:39:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:39:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:39:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:39:13.420139 543705 net.go:648] Add success.
I0320 06:39:13.422826 543705 net.go:770] primary dev: ETH0
I0320 06:39:13.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:39:13.422851 543705 net.go:698] Add success.
I0320 06:39:13.468395 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cddcf3ad-1522-4519-bd10-8d963305d14b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:39:13.468427 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:39:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:39:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:39:14.455245 543705 disk_worker.go:708] disk space is not compliant
W0320 06:39:14.455248 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:39:14.456750 543705 disk_worker.go:494] system disk:vda1
I0320 06:39:14.456785 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:39:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:39:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:39:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:39:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:39:16.472395 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:39:23.041678 543705 disk_info.go:125] begin check local disk info of client
I0320 06:39:23.044075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:39:23.044081 543705 disk_info.go:196] parse disk info done, disk is : [0xc000313140 0xc000313180]
E0320 06:39:23.407500 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:23.407512 543705 memory.go:184] no items to output this cycle
I0320 06:39:23.407539 543705 cpu.go:275] no items to output this cycle
E0320 06:39:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:33.409879 543705 memory.go:184] no items to output this cycle
I0320 06:39:33.409941 543705 cpu.go:275] no items to output this cycle
I0320 06:39:38.189732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:39:38.189739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:39:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:43.410583 543705 memory.go:191] Add success.
I0320 06:39:43.409809 543705 cpu.go:282] Add success.
I0320 06:39:43.420287 543705 net.go:648] Add success.
I0320 06:39:43.423158 543705 net.go:770] primary dev: ETH0
I0320 06:39:43.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:39:43.423183 543705 net.go:698] Add success.
I0320 06:39:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:39:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:39:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:39:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:39:53.409788 543705 memory.go:184] no items to output this cycle
I0320 06:39:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 06:40:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:03.409801 543705 memory.go:184] no items to output this cycle
I0320 06:40:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 06:40:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:13.409827 543705 memory.go:191] Add success.
I0320 06:40:13.409835 543705 cpu.go:282] Add success.
W0320 06:40:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:40:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:40:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:40:13.420180 543705 net.go:648] Add success.
I0320 06:40:13.423020 543705 net.go:770] primary dev: ETH0
I0320 06:40:13.423034 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:40:13.423049 543705 net.go:698] Add success.
I0320 06:40:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:40:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:40:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 06:40:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:40:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 06:40:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:40:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:40:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:40:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:40:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:40:23.045672 543705 disk_info.go:125] begin check local disk info of client
I0320 06:40:23.048170 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:40:23.048177 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329cc0 0xc000329d00]
E0320 06:40:23.407531 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:23.407547 543705 memory.go:184] no items to output this cycle
I0320 06:40:23.407562 543705 cpu.go:275] no items to output this cycle
E0320 06:40:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:33.409828 543705 memory.go:184] no items to output this cycle
I0320 06:40:33.409841 543705 cpu.go:275] no items to output this cycle
E0320 06:40:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:43.409785 543705 memory.go:191] Add success.
I0320 06:40:43.409798 543705 cpu.go:282] Add success.
I0320 06:40:43.420012 543705 net.go:648] Add success.
I0320 06:40:43.422633 543705 net.go:770] primary dev: ETH0
I0320 06:40:43.422645 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:40:43.422657 543705 net.go:698] Add success.
I0320 06:40:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:40:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:40:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:40:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:40:53.409766 543705 memory.go:184] no items to output this cycle
I0320 06:40:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 06:41:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:03.409798 543705 memory.go:184] no items to output this cycle
I0320 06:41:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 06:41:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:13.409790 543705 memory.go:191] Add success.
I0320 06:41:13.409806 543705 cpu.go:282] Add success.
W0320 06:41:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:41:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:41:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:41:13.420145 543705 net.go:648] Add success.
I0320 06:41:13.422912 543705 net.go:770] primary dev: ETH0
I0320 06:41:13.422925 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:41:13.422938 543705 net.go:698] Add success.
I0320 06:41:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:41:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:41:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 06:41:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:41:14.456560 543705 disk_worker.go:494] system disk:vda1
I0320 06:41:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:41:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:41:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:41:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:41:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:41:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:41:23.049677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:41:23.052116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:41:23.052123 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e580 0xc00034e5c0]
E0320 06:41:23.407990 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:23.408006 543705 memory.go:184] no items to output this cycle
I0320 06:41:23.408025 543705 cpu.go:275] no items to output this cycle
E0320 06:41:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:33.409803 543705 memory.go:184] no items to output this cycle
I0320 06:41:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 06:41:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:43.409814 543705 memory.go:191] Add success.
I0320 06:41:43.409820 543705 cpu.go:282] Add success.
I0320 06:41:43.420006 543705 net.go:648] Add success.
I0320 06:41:43.422975 543705 net.go:770] primary dev: ETH0
I0320 06:41:43.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:41:43.423008 543705 net.go:698] Add success.
I0320 06:41:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:41:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:41:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:41:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:41:53.409781 543705 memory.go:184] no items to output this cycle
I0320 06:41:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 06:42:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:03.409791 543705 memory.go:184] no items to output this cycle
I0320 06:42:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 06:42:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:13.409784 543705 memory.go:191] Add success.
W0320 06:42:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:42:13.409814 543705 cpu.go:282] Add success.
W0320 06:42:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:42:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:42:13.420305 543705 net.go:648] Add success.
I0320 06:42:13.423161 543705 net.go:770] primary dev: ETH0
I0320 06:42:13.423176 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:42:13.423191 543705 net.go:698] Add success.
I0320 06:42:13.464562 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e92258d4-c33c-4114-91a0-6b084885cd0b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:42:13.464596 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 06:42:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:42:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 06:42:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:42:14.457008 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:42:14.457017 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:42:14.457023 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:42:14.457120 543705 disk_worker.go:494] system disk:vda1
I0320 06:42:14.457163 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:42:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:42:15.456829 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:42:16.457931 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:42:16.457940 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:42:16.457991 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:42:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:42:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:42:23.053673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:42:23.056067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:42:23.056073 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab6c0 0xc0001ab700]
E0320 06:42:23.407512 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:23.407527 543705 memory.go:184] no items to output this cycle
I0320 06:42:23.407534 543705 cpu.go:275] no items to output this cycle
E0320 06:42:33.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:33.409892 543705 memory.go:184] no items to output this cycle
I0320 06:42:33.409953 543705 cpu.go:275] no items to output this cycle
I0320 06:42:38.189878 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:42:38.189883 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:42:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:43.410807 543705 memory.go:191] Add success.
I0320 06:42:43.409806 543705 cpu.go:282] Add success.
I0320 06:42:43.420541 543705 net.go:648] Add success.
I0320 06:42:43.423458 543705 net.go:770] primary dev: ETH0
I0320 06:42:43.423473 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:42:43.423488 543705 net.go:698] Add success.
I0320 06:42:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:42:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:42:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:42:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:42:53.409765 543705 memory.go:184] no items to output this cycle
I0320 06:42:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:43:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:03.409800 543705 memory.go:184] no items to output this cycle
I0320 06:43:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 06:43:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:13.409785 543705 memory.go:191] Add success.
I0320 06:43:13.409811 543705 cpu.go:282] Add success.
W0320 06:43:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:43:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:43:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:43:13.420076 543705 net.go:648] Add success.
I0320 06:43:13.422779 543705 net.go:770] primary dev: ETH0
I0320 06:43:13.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:43:13.422808 543705 net.go:698] Add success.
I0320 06:43:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:43:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:43:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 06:43:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:43:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 06:43:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:43:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:43:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:43:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:43:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:43:23.057671 543705 disk_info.go:125] begin check local disk info of client
I0320 06:43:23.060181 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:43:23.060190 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ec000 0xc0004ec040]
E0320 06:43:23.407534 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:23.407550 543705 memory.go:184] no items to output this cycle
I0320 06:43:23.407563 543705 cpu.go:275] no items to output this cycle
E0320 06:43:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:33.409774 543705 memory.go:184] no items to output this cycle
I0320 06:43:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 06:43:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:43.409918 543705 memory.go:191] Add success.
I0320 06:43:43.409927 543705 cpu.go:282] Add success.
I0320 06:43:43.419735 543705 net.go:648] Add success.
I0320 06:43:43.422819 543705 net.go:770] primary dev: ETH0
I0320 06:43:43.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:43:43.422852 543705 net.go:698] Add success.
I0320 06:43:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:43:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:43:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:43:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:43:53.409794 543705 memory.go:184] no items to output this cycle
I0320 06:43:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 06:44:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:03.409778 543705 memory.go:184] no items to output this cycle
I0320 06:44:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:44:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:13.409821 543705 memory.go:191] Add success.
I0320 06:44:13.409825 543705 cpu.go:282] Add success.
W0320 06:44:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:44:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:44:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:44:13.420244 543705 net.go:648] Add success.
I0320 06:44:13.422863 543705 net.go:770] primary dev: ETH0
I0320 06:44:13.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:44:13.422888 543705 net.go:698] Add success.
I0320 06:44:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:44:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:44:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 06:44:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:44:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 06:44:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:44:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:44:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:44:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:44:16.458042 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:44:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:44:23.061679 543705 disk_info.go:125] begin check local disk info of client
I0320 06:44:23.064050 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:44:23.064056 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab9c0 0xc0001aba00]
E0320 06:44:23.407851 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:23.407867 543705 memory.go:184] no items to output this cycle
I0320 06:44:23.407880 543705 cpu.go:275] no items to output this cycle
E0320 06:44:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:33.409808 543705 memory.go:184] no items to output this cycle
I0320 06:44:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 06:44:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:43.409777 543705 memory.go:191] Add success.
I0320 06:44:43.409798 543705 cpu.go:282] Add success.
I0320 06:44:43.419981 543705 net.go:648] Add success.
I0320 06:44:43.422849 543705 net.go:770] primary dev: ETH0
I0320 06:44:43.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:44:43.422878 543705 net.go:698] Add success.
I0320 06:44:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:44:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:44:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:44:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:44:53.409792 543705 memory.go:184] no items to output this cycle
I0320 06:44:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 06:45:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:03.409772 543705 memory.go:184] no items to output this cycle
I0320 06:45:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 06:45:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:13.409798 543705 memory.go:191] Add success.
I0320 06:45:13.409801 543705 cpu.go:282] Add success.
W0320 06:45:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:45:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:45:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:45:13.420157 543705 net.go:648] Add success.
I0320 06:45:13.422963 543705 net.go:770] primary dev: ETH0
I0320 06:45:13.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:45:13.422989 543705 net.go:698] Add success.
I0320 06:45:13.470048 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"592e429a-a482-4e1c-91c8-bb282bcf365b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:45:13.470081 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:45:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:45:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:45:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 06:45:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:45:14.456539 543705 disk_worker.go:494] system disk:vda1
I0320 06:45:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:45:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:45:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:45:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:45:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:45:16.472417 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:45:23.065674 543705 disk_info.go:125] begin check local disk info of client
I0320 06:45:23.068147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:45:23.068154 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b100 0xc00007b140]
E0320 06:45:23.407903 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:23.407914 543705 memory.go:184] no items to output this cycle
I0320 06:45:23.407954 543705 cpu.go:275] no items to output this cycle
E0320 06:45:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:33.409807 543705 memory.go:184] no items to output this cycle
I0320 06:45:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 06:45:38.193658 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:45:38.193666 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:45:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:43.410868 543705 memory.go:191] Add success.
I0320 06:45:43.409817 543705 cpu.go:282] Add success.
I0320 06:45:43.420561 543705 net.go:648] Add success.
I0320 06:45:43.423460 543705 net.go:770] primary dev: ETH0
I0320 06:45:43.423472 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:45:43.423484 543705 net.go:698] Add success.
I0320 06:45:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:45:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:45:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:45:53.409780 543705 memory.go:184] no items to output this cycle
I0320 06:45:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:46:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:03.409774 543705 memory.go:184] no items to output this cycle
I0320 06:46:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:46:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:13.409789 543705 memory.go:191] Add success.
W0320 06:46:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:46:13.409822 543705 cpu.go:282] Add success.
W0320 06:46:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:46:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:46:13.420233 543705 net.go:648] Add success.
I0320 06:46:13.423038 543705 net.go:770] primary dev: ETH0
I0320 06:46:13.423052 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:46:13.423066 543705 net.go:698] Add success.
I0320 06:46:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:46:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:46:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 06:46:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:46:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 06:46:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:46:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:46:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:46:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:46:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:46:23.069682 543705 disk_info.go:125] begin check local disk info of client
I0320 06:46:23.072097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:46:23.072102 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aae40 0xc0001aae80]
E0320 06:46:23.407889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:23.407908 543705 memory.go:184] no items to output this cycle
I0320 06:46:23.407927 543705 cpu.go:275] no items to output this cycle
E0320 06:46:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:33.409777 543705 memory.go:184] no items to output this cycle
I0320 06:46:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 06:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:43.409792 543705 memory.go:191] Add success.
I0320 06:46:43.409796 543705 cpu.go:282] Add success.
I0320 06:46:43.419945 543705 net.go:648] Add success.
I0320 06:46:43.423032 543705 net.go:770] primary dev: ETH0
I0320 06:46:43.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:46:43.423058 543705 net.go:698] Add success.
I0320 06:46:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:46:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:46:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:46:53.409887 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:46:53.409927 543705 memory.go:184] no items to output this cycle
I0320 06:46:53.410042 543705 cpu.go:275] no items to output this cycle
E0320 06:47:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:03.409776 543705 memory.go:184] no items to output this cycle
I0320 06:47:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 06:47:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:13.409791 543705 memory.go:191] Add success.
I0320 06:47:13.409810 543705 cpu.go:282] Add success.
W0320 06:47:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:47:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:47:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:47:13.420079 543705 net.go:648] Add success.
I0320 06:47:13.423368 543705 net.go:770] primary dev: ETH0
I0320 06:47:13.423382 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:47:13.423394 543705 net.go:698] Add success.
I0320 06:47:13.452910 543705 event_worker.go:152] Polling the log file for events...
W0320 06:47:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:47:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 06:47:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:47:14.455871 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:47:14.455880 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:47:14.455886 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:47:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 06:47:14.456656 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:47:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:47:15.456842 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:47:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:47:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:47:16.458004 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:47:16.458021 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:47:16.472339 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:47:23.073673 543705 disk_info.go:125] begin check local disk info of client
I0320 06:47:23.076059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:47:23.076065 543705 disk_info.go:196] parse disk info done, disk is : [0xc000312280 0xc0003122c0]
E0320 06:47:23.407814 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:23.407829 543705 memory.go:184] no items to output this cycle
I0320 06:47:23.407848 543705 cpu.go:275] no items to output this cycle
E0320 06:47:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:33.409784 543705 memory.go:184] no items to output this cycle
I0320 06:47:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 06:47:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:43.409815 543705 memory.go:191] Add success.
I0320 06:47:43.409820 543705 cpu.go:282] Add success.
I0320 06:47:43.420046 543705 net.go:648] Add success.
I0320 06:47:43.423655 543705 net.go:770] primary dev: ETH0
I0320 06:47:43.423669 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:47:43.423682 543705 net.go:698] Add success.
I0320 06:47:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:47:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:47:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:47:53.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:47:53.409893 543705 memory.go:184] no items to output this cycle
I0320 06:47:53.409951 543705 cpu.go:275] no items to output this cycle
E0320 06:48:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:03.409804 543705 memory.go:184] no items to output this cycle
I0320 06:48:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 06:48:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:13.409782 543705 memory.go:191] Add success.
W0320 06:48:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 06:48:13.409812 543705 cpu.go:282] Add success.
W0320 06:48:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:48:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:48:13.420173 543705 net.go:648] Add success.
I0320 06:48:13.422671 543705 net.go:770] primary dev: ETH0
I0320 06:48:13.422690 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:48:13.422704 543705 net.go:698] Add success.
I0320 06:48:13.468118 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2b6f5bff-566b-46a7-b0ec-8aee48f60cf8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:48:13.468151 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:48:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:48:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:48:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 06:48:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:48:14.456540 543705 disk_worker.go:494] system disk:vda1
I0320 06:48:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:48:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:48:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:48:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:48:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:48:16.472404 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:48:23.077680 543705 disk_info.go:125] begin check local disk info of client
I0320 06:48:23.080178 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:48:23.080184 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa400 0xc0001fa440]
E0320 06:48:23.407518 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:23.407535 543705 memory.go:184] no items to output this cycle
I0320 06:48:23.407544 543705 cpu.go:275] no items to output this cycle
E0320 06:48:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:33.409791 543705 memory.go:184] no items to output this cycle
I0320 06:48:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 06:48:38.197673 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:48:38.197680 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:48:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:43.410701 543705 memory.go:191] Add success.
I0320 06:48:43.409827 543705 cpu.go:282] Add success.
I0320 06:48:43.420395 543705 net.go:648] Add success.
I0320 06:48:43.422784 543705 net.go:770] primary dev: ETH0
I0320 06:48:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:48:43.422814 543705 net.go:698] Add success.
I0320 06:48:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:48:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:48:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:48:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:48:53.409803 543705 memory.go:184] no items to output this cycle
I0320 06:48:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 06:49:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:03.409783 543705 memory.go:184] no items to output this cycle
I0320 06:49:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 06:49:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:13.409812 543705 memory.go:191] Add success.
I0320 06:49:13.409813 543705 cpu.go:282] Add success.
W0320 06:49:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:49:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:49:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:49:13.420149 543705 net.go:648] Add success.
I0320 06:49:13.423149 543705 net.go:770] primary dev: ETH0
I0320 06:49:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:49:13.423174 543705 net.go:698] Add success.
I0320 06:49:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:49:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:49:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 06:49:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:49:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 06:49:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:49:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:49:16.458003 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:49:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:49:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:49:16.472408 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:49:23.081681 543705 disk_info.go:125] begin check local disk info of client
I0320 06:49:23.084060 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:49:23.084066 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003127c0 0xc000312800]
E0320 06:49:23.407528 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:23.407542 543705 memory.go:184] no items to output this cycle
I0320 06:49:23.407555 543705 cpu.go:275] no items to output this cycle
E0320 06:49:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:33.409817 543705 memory.go:184] no items to output this cycle
I0320 06:49:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 06:49:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:43.409829 543705 memory.go:191] Add success.
I0320 06:49:43.409833 543705 cpu.go:282] Add success.
I0320 06:49:43.419982 543705 net.go:648] Add success.
I0320 06:49:43.422845 543705 net.go:770] primary dev: ETH0
I0320 06:49:43.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:49:43.422870 543705 net.go:698] Add success.
I0320 06:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:49:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:49:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:49:53.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:49:53.409895 543705 memory.go:184] no items to output this cycle
I0320 06:49:53.409975 543705 cpu.go:275] no items to output this cycle
E0320 06:50:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:03.409803 543705 cpu.go:275] no items to output this cycle
I0320 06:50:03.409805 543705 memory.go:184] no items to output this cycle
E0320 06:50:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:13.409835 543705 memory.go:191] Add success.
I0320 06:50:13.409838 543705 cpu.go:282] Add success.
W0320 06:50:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:50:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:50:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:50:13.420272 543705 net.go:648] Add success.
I0320 06:50:13.423484 543705 net.go:770] primary dev: ETH0
I0320 06:50:13.423496 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:50:13.423508 543705 net.go:698] Add success.
I0320 06:50:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:50:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:50:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 06:50:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:50:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 06:50:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:50:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:50:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:50:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:50:16.472393 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:50:23.085677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:50:23.088162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:50:23.088169 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037f040 0xc00037f080]
E0320 06:50:23.407849 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:23.407866 543705 memory.go:184] no items to output this cycle
I0320 06:50:23.407888 543705 cpu.go:275] no items to output this cycle
E0320 06:50:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:33.409803 543705 memory.go:184] no items to output this cycle
I0320 06:50:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 06:50:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:43.409783 543705 memory.go:191] Add success.
I0320 06:50:43.409801 543705 cpu.go:282] Add success.
I0320 06:50:43.419858 543705 net.go:648] Add success.
I0320 06:50:43.422752 543705 net.go:770] primary dev: ETH0
I0320 06:50:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:50:43.422776 543705 net.go:698] Add success.
I0320 06:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:50:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:50:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:50:53.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:50:53.409914 543705 memory.go:184] no items to output this cycle
I0320 06:50:53.409927 543705 cpu.go:275] no items to output this cycle
E0320 06:51:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:03.409779 543705 cpu.go:275] no items to output this cycle
I0320 06:51:03.409794 543705 memory.go:184] no items to output this cycle
E0320 06:51:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:13.409797 543705 memory.go:191] Add success.
I0320 06:51:13.409801 543705 cpu.go:282] Add success.
W0320 06:51:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:51:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:51:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:51:13.420223 543705 net.go:648] Add success.
I0320 06:51:13.423109 543705 net.go:770] primary dev: ETH0
I0320 06:51:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:51:13.423139 543705 net.go:698] Add success.
I0320 06:51:13.468774 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"caca9206-7199-49a9-abff-f44606e6aa46","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:51:13.468810 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:51:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:51:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:51:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 06:51:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:51:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 06:51:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:51:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:51:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:51:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:51:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:51:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:51:23.089678 543705 disk_info.go:125] begin check local disk info of client
I0320 06:51:23.092138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:51:23.092145 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003804c0 0xc000380500]
E0320 06:51:23.407512 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:23.407525 543705 memory.go:184] no items to output this cycle
I0320 06:51:23.407553 543705 cpu.go:275] no items to output this cycle
E0320 06:51:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:33.409786 543705 memory.go:184] no items to output this cycle
I0320 06:51:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 06:51:38.201697 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:51:38.201703 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:51:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:43.410713 543705 memory.go:191] Add success.
I0320 06:51:43.409814 543705 cpu.go:282] Add success.
I0320 06:51:43.419736 543705 net.go:648] Add success.
I0320 06:51:43.422745 543705 net.go:770] primary dev: ETH0
I0320 06:51:43.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:51:43.422773 543705 net.go:698] Add success.
I0320 06:51:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:51:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:51:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:51:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:51:53.409773 543705 memory.go:184] no items to output this cycle
I0320 06:51:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 06:52:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:03.409781 543705 cpu.go:275] no items to output this cycle
I0320 06:52:03.409786 543705 memory.go:184] no items to output this cycle
E0320 06:52:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:13.409819 543705 memory.go:191] Add success.
I0320 06:52:13.409826 543705 cpu.go:282] Add success.
W0320 06:52:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:52:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:52:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:52:13.420133 543705 net.go:648] Add success.
I0320 06:52:13.422866 543705 net.go:770] primary dev: ETH0
I0320 06:52:13.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:52:13.422894 543705 net.go:698] Add success.
W0320 06:52:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:52:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 06:52:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:52:14.456785 543705 disk_worker.go:494] system disk:vda1
I0320 06:52:14.456827 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:52:14.457138 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:52:14.457146 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:52:14.457151 543705 custom_config.go:64] query custom config with name: gpu
E0320 06:52:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:52:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:52:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:52:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:52:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:52:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:52:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:52:23.093675 543705 disk_info.go:125] begin check local disk info of client
I0320 06:52:23.096227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:52:23.096234 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032ea40 0xc00032ea80]
E0320 06:52:23.407848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:23.407860 543705 memory.go:184] no items to output this cycle
I0320 06:52:23.407894 543705 cpu.go:275] no items to output this cycle
E0320 06:52:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:33.409788 543705 memory.go:184] no items to output this cycle
I0320 06:52:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 06:52:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:43.409800 543705 memory.go:191] Add success.
I0320 06:52:43.409802 543705 cpu.go:282] Add success.
I0320 06:52:43.419984 543705 net.go:648] Add success.
I0320 06:52:43.422755 543705 net.go:770] primary dev: ETH0
I0320 06:52:43.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:52:43.422779 543705 net.go:698] Add success.
I0320 06:52:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:52:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:52:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:52:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:52:53.409770 543705 memory.go:184] no items to output this cycle
I0320 06:52:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 06:53:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:03.409783 543705 cpu.go:275] no items to output this cycle
I0320 06:53:03.409786 543705 memory.go:184] no items to output this cycle
E0320 06:53:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:13.409829 543705 memory.go:191] Add success.
I0320 06:53:13.409830 543705 cpu.go:282] Add success.
W0320 06:53:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:53:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:53:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:53:13.420260 543705 net.go:648] Add success.
I0320 06:53:13.423495 543705 net.go:770] primary dev: ETH0
I0320 06:53:13.423509 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:53:13.423520 543705 net.go:698] Add success.
I0320 06:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:53:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:53:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 06:53:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:53:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 06:53:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:53:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:53:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:53:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:53:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:53:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:53:23.097677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:53:23.100172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:53:23.100178 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b78c0 0xc0002b7900]
E0320 06:53:23.407804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:23.407816 543705 memory.go:184] no items to output this cycle
I0320 06:53:23.407825 543705 cpu.go:275] no items to output this cycle
E0320 06:53:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:33.409809 543705 memory.go:184] no items to output this cycle
I0320 06:53:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 06:53:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:43.409826 543705 memory.go:191] Add success.
I0320 06:53:43.409835 543705 cpu.go:282] Add success.
I0320 06:53:43.420022 543705 net.go:648] Add success.
I0320 06:53:43.422670 543705 net.go:770] primary dev: ETH0
I0320 06:53:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:53:43.422704 543705 net.go:698] Add success.
I0320 06:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:53:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:53:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:53:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:53:53.409804 543705 memory.go:184] no items to output this cycle
I0320 06:53:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 06:54:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:03.409778 543705 memory.go:184] no items to output this cycle
I0320 06:54:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 06:54:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:13.409802 543705 memory.go:191] Add success.
I0320 06:54:13.409807 543705 cpu.go:282] Add success.
W0320 06:54:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:54:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:54:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:54:13.420132 543705 net.go:648] Add success.
I0320 06:54:13.422654 543705 net.go:770] primary dev: ETH0
I0320 06:54:13.422670 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:54:13.422684 543705 net.go:698] Add success.
I0320 06:54:13.464291 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1e743f54-5c76-4cc1-803d-1218406f212d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:54:13.464326 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 06:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:54:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:54:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 06:54:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:54:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 06:54:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:54:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:54:16.472423 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:54:23.101677 543705 disk_info.go:125] begin check local disk info of client
I0320 06:54:23.104159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:54:23.104167 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005455c0 0xc000545600]
E0320 06:54:23.407772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:23.407789 543705 memory.go:184] no items to output this cycle
I0320 06:54:23.407804 543705 cpu.go:275] no items to output this cycle
E0320 06:54:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:33.409784 543705 memory.go:184] no items to output this cycle
I0320 06:54:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 06:54:38.205717 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:54:38.205723 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:54:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:43.410688 543705 memory.go:191] Add success.
I0320 06:54:43.409808 543705 cpu.go:282] Add success.
I0320 06:54:43.420404 543705 net.go:648] Add success.
I0320 06:54:43.423333 543705 net.go:770] primary dev: ETH0
I0320 06:54:43.423351 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:54:43.423366 543705 net.go:698] Add success.
I0320 06:54:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:54:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:54:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:54:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:54:53.409766 543705 memory.go:184] no items to output this cycle
I0320 06:54:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 06:55:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:03.409772 543705 memory.go:184] no items to output this cycle
I0320 06:55:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 06:55:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:13.409805 543705 memory.go:191] Add success.
I0320 06:55:13.409809 543705 cpu.go:282] Add success.
W0320 06:55:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:55:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:55:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:55:13.420127 543705 net.go:648] Add success.
I0320 06:55:13.422875 543705 net.go:770] primary dev: ETH0
I0320 06:55:13.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:55:13.422901 543705 net.go:698] Add success.
I0320 06:55:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:55:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:55:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 06:55:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:55:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 06:55:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:55:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:55:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:55:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:55:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:55:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:55:23.105681 543705 disk_info.go:125] begin check local disk info of client
I0320 06:55:23.108177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:55:23.108198 543705 disk_info.go:196] parse disk info done, disk is : [0xc000564080 0xc0005640c0]
E0320 06:55:23.407744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:23.407757 543705 memory.go:184] no items to output this cycle
I0320 06:55:23.407783 543705 cpu.go:275] no items to output this cycle
E0320 06:55:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:33.409807 543705 memory.go:184] no items to output this cycle
I0320 06:55:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 06:55:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:43.409816 543705 memory.go:191] Add success.
I0320 06:55:43.409826 543705 cpu.go:282] Add success.
I0320 06:55:43.419992 543705 net.go:648] Add success.
I0320 06:55:43.422731 543705 net.go:770] primary dev: ETH0
I0320 06:55:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:55:43.422756 543705 net.go:698] Add success.
I0320 06:55:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:55:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:55:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:55:53.410249 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:55:53.410266 543705 memory.go:184] no items to output this cycle
I0320 06:55:53.410275 543705 cpu.go:275] no items to output this cycle
E0320 06:56:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:03.409778 543705 memory.go:184] no items to output this cycle
I0320 06:56:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 06:56:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:13.409803 543705 memory.go:191] Add success.
I0320 06:56:13.409818 543705 cpu.go:282] Add success.
W0320 06:56:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:56:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:56:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:56:13.420137 543705 net.go:648] Add success.
I0320 06:56:13.422818 543705 net.go:770] primary dev: ETH0
I0320 06:56:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:56:13.422843 543705 net.go:698] Add success.
I0320 06:56:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:56:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:56:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 06:56:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:56:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 06:56:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:56:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:56:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:56:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:56:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:56:16.472458 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:56:23.109676 543705 disk_info.go:125] begin check local disk info of client
I0320 06:56:23.112239 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:56:23.112245 543705 disk_info.go:196] parse disk info done, disk is : [0xc000265c00 0xc000265c40]
E0320 06:56:23.407803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:23.407818 543705 memory.go:184] no items to output this cycle
I0320 06:56:23.407828 543705 cpu.go:275] no items to output this cycle
E0320 06:56:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:33.409801 543705 memory.go:184] no items to output this cycle
I0320 06:56:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 06:56:43.409860 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:43.409889 543705 memory.go:191] Add success.
I0320 06:56:43.409963 543705 cpu.go:282] Add success.
I0320 06:56:43.419709 543705 net.go:648] Add success.
I0320 06:56:43.422560 543705 net.go:770] primary dev: ETH0
I0320 06:56:43.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:56:43.422584 543705 net.go:698] Add success.
I0320 06:56:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:56:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:56:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:56:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:56:53.409795 543705 memory.go:184] no items to output this cycle
I0320 06:56:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 06:57:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:03.409767 543705 memory.go:184] no items to output this cycle
I0320 06:57:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 06:57:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:13.409793 543705 memory.go:191] Add success.
I0320 06:57:13.409810 543705 cpu.go:282] Add success.
W0320 06:57:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:57:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:57:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:57:13.420135 543705 net.go:648] Add success.
I0320 06:57:13.429007 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 06:57:13.429079 543705 net.go:770] primary dev: ETH0
I0320 06:57:13.429093 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:57:13.429108 543705 net.go:698] Add success.
I0320 06:57:13.453658 543705 event_worker.go:152] Polling the log file for events...
I0320 06:57:13.603316 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8f3b8a00-658f-4cd7-9462-318cdb08186b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 06:57:13.603357 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 06:57:14.454890 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:57:14.454903 543705 disk_worker.go:708] disk space is not compliant
W0320 06:57:14.454907 543705 disk_worker.go:728] disk inode is not compliant
E0320 06:57:14.455614 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 06:57:14.455623 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 06:57:14.455628 543705 custom_config.go:64] query custom config with name: gpu
I0320 06:57:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 06:57:14.456520 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 06:57:15.456892 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 06:57:15.456903 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:57:16.458027 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 06:57:16.458029 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 06:57:16.458081 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:57:16.458104 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:57:16.472506 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:57:23.113680 543705 disk_info.go:125] begin check local disk info of client
I0320 06:57:23.116169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:57:23.116175 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037f780 0xc00037f7c0]
E0320 06:57:23.407690 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:23.407702 543705 memory.go:184] no items to output this cycle
I0320 06:57:23.407736 543705 cpu.go:275] no items to output this cycle
E0320 06:57:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:33.409784 543705 memory.go:184] no items to output this cycle
I0320 06:57:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 06:57:38.209746 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 06:57:38.209751 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 06:57:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:43.410682 543705 memory.go:191] Add success.
I0320 06:57:43.409826 543705 cpu.go:282] Add success.
I0320 06:57:43.420197 543705 net.go:770] primary dev: ETH0
I0320 06:57:43.420210 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:57:43.420222 543705 net.go:698] Add success.
I0320 06:57:43.420622 543705 net.go:648] Add success.
I0320 06:57:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:57:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:57:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:57:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:57:53.409799 543705 memory.go:184] no items to output this cycle
I0320 06:57:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 06:58:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:03.409775 543705 memory.go:184] no items to output this cycle
I0320 06:58:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 06:58:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:13.409793 543705 memory.go:191] Add success.
I0320 06:58:13.409810 543705 cpu.go:282] Add success.
W0320 06:58:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:58:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:58:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:58:13.420100 543705 net.go:648] Add success.
I0320 06:58:13.422589 543705 net.go:770] primary dev: ETH0
I0320 06:58:13.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:58:13.422617 543705 net.go:698] Add success.
I0320 06:58:14.453934 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:58:14.455243 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:58:14.455257 543705 disk_worker.go:708] disk space is not compliant
W0320 06:58:14.455261 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:58:14.457272 543705 disk_worker.go:494] system disk:vda1
I0320 06:58:14.457320 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:58:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:58:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:58:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:58:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:58:16.472444 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:58:23.117674 543705 disk_info.go:125] begin check local disk info of client
I0320 06:58:23.120231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:58:23.120238 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256200 0xc000256240]
E0320 06:58:23.407729 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:23.407741 543705 memory.go:184] no items to output this cycle
I0320 06:58:23.407777 543705 cpu.go:275] no items to output this cycle
E0320 06:58:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:33.409770 543705 memory.go:184] no items to output this cycle
I0320 06:58:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 06:58:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:43.409816 543705 memory.go:191] Add success.
I0320 06:58:43.409826 543705 cpu.go:282] Add success.
I0320 06:58:43.419745 543705 net.go:648] Add success.
I0320 06:58:43.422855 543705 net.go:770] primary dev: ETH0
I0320 06:58:43.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:58:43.422898 543705 net.go:698] Add success.
I0320 06:58:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:58:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:58:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:58:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:58:53.409776 543705 memory.go:184] no items to output this cycle
I0320 06:58:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 06:59:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:03.409778 543705 memory.go:184] no items to output this cycle
I0320 06:59:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 06:59:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:13.409821 543705 memory.go:191] Add success.
I0320 06:59:13.409824 543705 cpu.go:282] Add success.
W0320 06:59:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 06:59:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 06:59:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 06:59:13.420166 543705 net.go:648] Add success.
I0320 06:59:13.422835 543705 net.go:770] primary dev: ETH0
I0320 06:59:13.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:59:13.422860 543705 net.go:698] Add success.
I0320 06:59:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 06:59:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 06:59:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 06:59:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 06:59:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 06:59:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 06:59:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 06:59:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:59:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:59:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 06:59:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 06:59:23.121672 543705 disk_info.go:125] begin check local disk info of client
I0320 06:59:23.124169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 06:59:23.124175 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fafc0 0xc0001fb000]
E0320 06:59:23.407519 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:23.407533 543705 memory.go:184] no items to output this cycle
I0320 06:59:23.407535 543705 cpu.go:275] no items to output this cycle
E0320 06:59:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:33.409781 543705 memory.go:184] no items to output this cycle
I0320 06:59:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 06:59:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:43.409792 543705 memory.go:191] Add success.
I0320 06:59:43.409793 543705 cpu.go:282] Add success.
I0320 06:59:43.419980 543705 net.go:648] Add success.
I0320 06:59:43.422773 543705 net.go:770] primary dev: ETH0
I0320 06:59:43.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0320 06:59:43.422797 543705 net.go:698] Add success.
I0320 06:59:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 06:59:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 06:59:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 06:59:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 06:59:53.409798 543705 memory.go:184] no items to output this cycle
I0320 06:59:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 07:00:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:03.409789 543705 memory.go:184] no items to output this cycle
I0320 07:00:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 07:00:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:13.409807 543705 memory.go:191] Add success.
I0320 07:00:13.409811 543705 cpu.go:282] Add success.
W0320 07:00:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:00:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:00:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:00:13.420357 543705 net.go:648] Add success.
I0320 07:00:13.423051 543705 net.go:770] primary dev: ETH0
I0320 07:00:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:00:13.423080 543705 net.go:698] Add success.
I0320 07:00:13.469202 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"72f3779f-3b9b-40fe-aada-4960bdf1c08d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:00:13.469236 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:00:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:00:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:00:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 07:00:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:00:14.456747 543705 disk_worker.go:494] system disk:vda1
I0320 07:00:14.456781 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:00:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:00:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:00:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:00:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:00:23.125678 543705 disk_info.go:125] begin check local disk info of client
I0320 07:00:23.128142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:00:23.128148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7180 0xc0001c71c0]
E0320 07:00:23.407612 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:23.407624 543705 memory.go:184] no items to output this cycle
I0320 07:00:23.407661 543705 cpu.go:275] no items to output this cycle
E0320 07:00:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:33.409803 543705 memory.go:184] no items to output this cycle
I0320 07:00:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 07:00:38.213734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:00:38.213741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:00:43.409824 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:43.410735 543705 memory.go:191] Add success.
I0320 07:00:43.409885 543705 cpu.go:282] Add success.
I0320 07:00:43.420451 543705 net.go:648] Add success.
I0320 07:00:43.423106 543705 net.go:770] primary dev: ETH0
I0320 07:00:43.423120 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:00:43.423132 543705 net.go:698] Add success.
I0320 07:00:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:00:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:00:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:00:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:00:53.409779 543705 memory.go:184] no items to output this cycle
I0320 07:00:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:01:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:03.409778 543705 memory.go:184] no items to output this cycle
I0320 07:01:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 07:01:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:13.409805 543705 memory.go:191] Add success.
I0320 07:01:13.409817 543705 cpu.go:282] Add success.
W0320 07:01:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:01:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:01:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:01:13.420113 543705 net.go:648] Add success.
I0320 07:01:13.422952 543705 net.go:770] primary dev: ETH0
I0320 07:01:13.422964 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:01:13.422977 543705 net.go:698] Add success.
I0320 07:01:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:01:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:01:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 07:01:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:01:14.456500 543705 disk_worker.go:494] system disk:vda1
I0320 07:01:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:01:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:01:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:01:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:01:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:01:16.472357 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:01:23.129672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:01:23.132064 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:01:23.132069 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b480 0xc00007b4c0]
E0320 07:01:23.407546 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:23.407564 543705 memory.go:184] no items to output this cycle
I0320 07:01:23.407568 543705 cpu.go:275] no items to output this cycle
E0320 07:01:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:33.409797 543705 memory.go:184] no items to output this cycle
I0320 07:01:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 07:01:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:43.409795 543705 memory.go:191] Add success.
I0320 07:01:43.409815 543705 cpu.go:282] Add success.
I0320 07:01:43.419952 543705 net.go:648] Add success.
I0320 07:01:43.422699 543705 net.go:770] primary dev: ETH0
I0320 07:01:43.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:01:43.422724 543705 net.go:698] Add success.
I0320 07:01:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:01:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:01:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:01:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:01:53.409859 543705 memory.go:184] no items to output this cycle
I0320 07:01:53.409967 543705 cpu.go:275] no items to output this cycle
E0320 07:02:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:03.409777 543705 memory.go:184] no items to output this cycle
I0320 07:02:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 07:02:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:13.409809 543705 memory.go:191] Add success.
I0320 07:02:13.409813 543705 cpu.go:282] Add success.
W0320 07:02:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:02:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:02:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:02:13.420160 543705 net.go:648] Add success.
I0320 07:02:13.422819 543705 net.go:770] primary dev: ETH0
I0320 07:02:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:02:13.422845 543705 net.go:698] Add success.
W0320 07:02:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:02:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 07:02:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:02:14.456796 543705 disk_worker.go:494] system disk:vda1
I0320 07:02:14.456835 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:02:14.457132 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:02:14.457140 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:02:14.457144 543705 custom_config.go:64] query custom config with name: gpu
E0320 07:02:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:02:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:02:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:02:16.457949 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:02:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:02:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:02:16.472356 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:02:23.133671 543705 disk_info.go:125] begin check local disk info of client
I0320 07:02:23.136079 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:02:23.136086 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c81c0 0xc0003c8200]
E0320 07:02:23.407520 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:23.407537 543705 memory.go:184] no items to output this cycle
I0320 07:02:23.407557 543705 cpu.go:275] no items to output this cycle
E0320 07:02:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:33.409786 543705 memory.go:184] no items to output this cycle
I0320 07:02:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:02:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:43.409801 543705 memory.go:191] Add success.
I0320 07:02:43.409810 543705 cpu.go:282] Add success.
I0320 07:02:43.419889 543705 net.go:648] Add success.
I0320 07:02:43.422715 543705 net.go:770] primary dev: ETH0
I0320 07:02:43.422729 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:02:43.422753 543705 net.go:698] Add success.
I0320 07:02:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:02:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:02:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:02:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:02:53.409806 543705 memory.go:184] no items to output this cycle
I0320 07:02:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 07:03:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:03.409798 543705 memory.go:184] no items to output this cycle
I0320 07:03:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 07:03:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:13.409812 543705 memory.go:191] Add success.
I0320 07:03:13.409815 543705 cpu.go:282] Add success.
W0320 07:03:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:03:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:03:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:03:13.420250 543705 net.go:648] Add success.
I0320 07:03:13.423191 543705 net.go:770] primary dev: ETH0
I0320 07:03:13.423206 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:03:13.423220 543705 net.go:698] Add success.
I0320 07:03:13.486472 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31ed8cf3-c885-4b6a-8d7c-031e3a1b0a3d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:03:13.486506 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:03:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:03:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:03:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 07:03:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:03:14.456728 543705 disk_worker.go:494] system disk:vda1
I0320 07:03:14.456760 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:03:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:03:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:03:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:03:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:03:16.472401 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:03:23.137673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:03:23.140172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:03:23.140178 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037eac0 0xc00037eb00]
E0320 07:03:23.407570 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:23.407583 543705 memory.go:184] no items to output this cycle
I0320 07:03:23.407612 543705 cpu.go:275] no items to output this cycle
E0320 07:03:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:33.409786 543705 memory.go:184] no items to output this cycle
I0320 07:03:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 07:03:38.217736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:03:38.217743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:03:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:43.410685 543705 memory.go:191] Add success.
I0320 07:03:43.409839 543705 cpu.go:282] Add success.
I0320 07:03:43.420428 543705 net.go:648] Add success.
I0320 07:03:43.423117 543705 net.go:770] primary dev: ETH0
I0320 07:03:43.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:03:43.423145 543705 net.go:698] Add success.
I0320 07:03:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:03:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:03:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:03:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:03:53.409780 543705 memory.go:184] no items to output this cycle
I0320 07:03:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 07:04:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:03.409868 543705 memory.go:184] no items to output this cycle
I0320 07:04:03.409974 543705 cpu.go:275] no items to output this cycle
E0320 07:04:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:13.409826 543705 memory.go:191] Add success.
I0320 07:04:13.409836 543705 cpu.go:282] Add success.
W0320 07:04:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:04:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:04:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:04:13.420252 543705 net.go:648] Add success.
I0320 07:04:13.422999 543705 net.go:770] primary dev: ETH0
I0320 07:04:13.423014 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:04:13.423028 543705 net.go:698] Add success.
I0320 07:04:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:04:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:04:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 07:04:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:04:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 07:04:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:04:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:04:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:04:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:04:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:04:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:04:23.141674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:04:23.144138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:04:23.144144 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fab40 0xc0001fab80]
E0320 07:04:23.407532 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:23.407547 543705 memory.go:184] no items to output this cycle
I0320 07:04:23.407563 543705 cpu.go:275] no items to output this cycle
E0320 07:04:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:33.409772 543705 memory.go:184] no items to output this cycle
I0320 07:04:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:04:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:43.409817 543705 memory.go:191] Add success.
I0320 07:04:43.409829 543705 cpu.go:282] Add success.
I0320 07:04:43.419989 543705 net.go:648] Add success.
I0320 07:04:43.422895 543705 net.go:770] primary dev: ETH0
I0320 07:04:43.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:04:43.422921 543705 net.go:698] Add success.
I0320 07:04:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:04:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:04:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:04:53.410369 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:04:53.410384 543705 memory.go:184] no items to output this cycle
I0320 07:04:53.410386 543705 cpu.go:275] no items to output this cycle
E0320 07:05:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:03.409780 543705 memory.go:184] no items to output this cycle
I0320 07:05:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 07:05:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:13.409806 543705 memory.go:191] Add success.
I0320 07:05:13.409807 543705 cpu.go:282] Add success.
W0320 07:05:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:05:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:05:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:05:13.420192 543705 net.go:648] Add success.
I0320 07:05:13.422933 543705 net.go:770] primary dev: ETH0
I0320 07:05:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:05:13.422959 543705 net.go:698] Add success.
I0320 07:05:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:05:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:05:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 07:05:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:05:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 07:05:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:05:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:05:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:05:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:05:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:05:16.472392 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:05:23.145673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:05:23.148115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:05:23.148121 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037fc80 0xc00037fcc0]
E0320 07:05:23.407510 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:23.407523 543705 memory.go:184] no items to output this cycle
I0320 07:05:23.407553 543705 cpu.go:275] no items to output this cycle
E0320 07:05:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:33.409803 543705 memory.go:184] no items to output this cycle
I0320 07:05:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 07:05:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:43.409776 543705 memory.go:191] Add success.
I0320 07:05:43.409800 543705 cpu.go:282] Add success.
I0320 07:05:43.420035 543705 net.go:648] Add success.
I0320 07:05:43.423724 543705 net.go:770] primary dev: ETH0
I0320 07:05:43.423739 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:05:43.423753 543705 net.go:698] Add success.
I0320 07:05:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:05:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:05:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:05:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:05:53.409780 543705 memory.go:184] no items to output this cycle
I0320 07:05:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 07:06:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:03.409795 543705 memory.go:184] no items to output this cycle
I0320 07:06:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 07:06:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:13.409907 543705 cpu.go:282] Add success.
I0320 07:06:13.409923 543705 memory.go:191] Add success.
W0320 07:06:13.409954 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:06:13.409979 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:06:13.409983 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:06:13.419725 543705 net.go:648] Add success.
I0320 07:06:13.422371 543705 net.go:770] primary dev: ETH0
I0320 07:06:13.422384 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:06:13.422396 543705 net.go:698] Add success.
I0320 07:06:13.468255 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0093675a-2196-4e9d-b85f-313177df5c30","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:06:13.468286 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:06:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:06:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:06:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 07:06:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:06:14.456714 543705 disk_worker.go:494] system disk:vda1
I0320 07:06:14.456742 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:06:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:06:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:06:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:06:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:06:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:06:23.149673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:06:23.152116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:06:23.152122 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037fd40 0xc00037fd80]
E0320 07:06:23.407480 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:23.407497 543705 memory.go:184] no items to output this cycle
I0320 07:06:23.407511 543705 cpu.go:275] no items to output this cycle
E0320 07:06:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:33.409801 543705 memory.go:184] no items to output this cycle
I0320 07:06:33.409815 543705 cpu.go:275] no items to output this cycle
I0320 07:06:38.221732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:06:38.221739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:06:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:43.410755 543705 memory.go:191] Add success.
I0320 07:06:43.409824 543705 cpu.go:282] Add success.
I0320 07:06:43.420433 543705 net.go:648] Add success.
I0320 07:06:43.423320 543705 net.go:770] primary dev: ETH0
I0320 07:06:43.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:06:43.423344 543705 net.go:698] Add success.
I0320 07:06:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:06:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:06:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:06:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:06:53.409774 543705 memory.go:184] no items to output this cycle
I0320 07:06:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 07:07:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:03.409771 543705 memory.go:184] no items to output this cycle
I0320 07:07:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:07:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:13.409817 543705 memory.go:191] Add success.
I0320 07:07:13.409826 543705 cpu.go:282] Add success.
W0320 07:07:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:07:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:07:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:07:13.420464 543705 net.go:648] Add success.
I0320 07:07:13.423850 543705 net.go:770] primary dev: ETH0
I0320 07:07:13.423862 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:07:13.423874 543705 net.go:698] Add success.
I0320 07:07:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0320 07:07:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:07:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 07:07:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:07:14.456536 543705 disk_worker.go:494] system disk:vda1
I0320 07:07:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:07:14.457386 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:07:14.457393 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:07:14.457398 543705 custom_config.go:64] query custom config with name: gpu
E0320 07:07:15.456793 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:07:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:07:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:07:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:07:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:07:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:07:16.472323 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:07:23.153673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:07:23.156111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:07:23.156117 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007acc0 0xc00007ad00]
E0320 07:07:23.407444 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:23.407456 543705 memory.go:184] no items to output this cycle
I0320 07:07:23.407477 543705 cpu.go:275] no items to output this cycle
E0320 07:07:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:33.409809 543705 memory.go:184] no items to output this cycle
I0320 07:07:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 07:07:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:43.409785 543705 memory.go:191] Add success.
I0320 07:07:43.409787 543705 cpu.go:282] Add success.
I0320 07:07:43.419853 543705 net.go:648] Add success.
I0320 07:07:43.422494 543705 net.go:770] primary dev: ETH0
I0320 07:07:43.422507 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:07:43.422519 543705 net.go:698] Add success.
I0320 07:07:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:07:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:07:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:07:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:07:53.409767 543705 memory.go:184] no items to output this cycle
I0320 07:07:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 07:08:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:03.409772 543705 memory.go:184] no items to output this cycle
I0320 07:08:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 07:08:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:13.409829 543705 memory.go:191] Add success.
I0320 07:08:13.409831 543705 cpu.go:282] Add success.
W0320 07:08:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:08:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:08:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:08:13.420349 543705 net.go:648] Add success.
I0320 07:08:13.423080 543705 net.go:770] primary dev: ETH0
I0320 07:08:13.423092 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:08:13.423104 543705 net.go:698] Add success.
I0320 07:08:14.454943 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:08:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:08:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 07:08:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:08:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 07:08:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:08:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:08:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:08:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:08:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:08:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:08:23.157674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:08:23.160144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:08:23.160164 543705 disk_info.go:196] parse disk info done, disk is : [0xc000465400 0xc000465440]
E0320 07:08:23.407441 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:23.407453 543705 memory.go:184] no items to output this cycle
I0320 07:08:23.407489 543705 cpu.go:275] no items to output this cycle
E0320 07:08:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:33.409810 543705 memory.go:184] no items to output this cycle
I0320 07:08:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 07:08:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:43.409795 543705 memory.go:191] Add success.
I0320 07:08:43.409814 543705 cpu.go:282] Add success.
I0320 07:08:43.419895 543705 net.go:648] Add success.
I0320 07:08:43.422723 543705 net.go:770] primary dev: ETH0
I0320 07:08:43.422737 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:08:43.422752 543705 net.go:698] Add success.
I0320 07:08:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:08:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:08:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:08:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:08:53.409804 543705 memory.go:184] no items to output this cycle
I0320 07:08:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 07:09:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:03.409771 543705 memory.go:184] no items to output this cycle
I0320 07:09:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:09:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:13.409820 543705 memory.go:191] Add success.
I0320 07:09:13.409824 543705 cpu.go:282] Add success.
W0320 07:09:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:09:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:09:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:09:13.420066 543705 net.go:648] Add success.
I0320 07:09:13.423215 543705 net.go:770] primary dev: ETH0
I0320 07:09:13.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:09:13.423243 543705 net.go:698] Add success.
I0320 07:09:13.463172 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"095674d6-e40a-4f8d-b0b2-1d3accab5bbf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:09:13.463205 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:09:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:09:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 07:09:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:09:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 07:09:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:09:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:09:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:09:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:09:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:09:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:09:23.161681 543705 disk_info.go:125] begin check local disk info of client
I0320 07:09:23.164091 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:09:23.164098 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ab80 0xc00007abc0]
E0320 07:09:23.408469 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:23.408485 543705 memory.go:184] no items to output this cycle
I0320 07:09:23.408497 543705 cpu.go:275] no items to output this cycle
E0320 07:09:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:33.409809 543705 memory.go:184] no items to output this cycle
I0320 07:09:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 07:09:38.225739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:09:38.225746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:09:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:43.410764 543705 memory.go:191] Add success.
I0320 07:09:43.409795 543705 cpu.go:282] Add success.
I0320 07:09:43.420553 543705 net.go:648] Add success.
I0320 07:09:43.423607 543705 net.go:770] primary dev: ETH0
I0320 07:09:43.423620 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:09:43.423633 543705 net.go:698] Add success.
I0320 07:09:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:09:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:09:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:09:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:09:53.409783 543705 memory.go:184] no items to output this cycle
I0320 07:09:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:10:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:03.409787 543705 memory.go:184] no items to output this cycle
I0320 07:10:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:10:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:13.409803 543705 memory.go:191] Add success.
I0320 07:10:13.409804 543705 cpu.go:282] Add success.
W0320 07:10:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:10:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:10:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:10:13.420089 543705 net.go:648] Add success.
I0320 07:10:13.423209 543705 net.go:770] primary dev: ETH0
I0320 07:10:13.423223 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:10:13.423237 543705 net.go:698] Add success.
I0320 07:10:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:10:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:10:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 07:10:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:10:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 07:10:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:10:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:10:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:10:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:10:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:10:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:10:23.165673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:10:23.168144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:10:23.168150 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034ac40 0xc00034ac80]
E0320 07:10:23.407523 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:23.407540 543705 memory.go:184] no items to output this cycle
I0320 07:10:23.407552 543705 cpu.go:275] no items to output this cycle
E0320 07:10:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:33.409782 543705 memory.go:184] no items to output this cycle
I0320 07:10:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:10:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:43.409794 543705 memory.go:191] Add success.
I0320 07:10:43.409794 543705 cpu.go:282] Add success.
I0320 07:10:43.419885 543705 net.go:648] Add success.
I0320 07:10:43.422849 543705 net.go:770] primary dev: ETH0
I0320 07:10:43.422862 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:10:43.422875 543705 net.go:698] Add success.
I0320 07:10:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:10:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:10:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:10:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:10:53.409762 543705 memory.go:184] no items to output this cycle
I0320 07:10:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:11:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:03.409804 543705 memory.go:184] no items to output this cycle
I0320 07:11:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 07:11:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:13.409824 543705 memory.go:191] Add success.
I0320 07:11:13.409829 543705 cpu.go:282] Add success.
W0320 07:11:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:11:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:11:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:11:13.420175 543705 net.go:648] Add success.
I0320 07:11:13.423401 543705 net.go:770] primary dev: ETH0
I0320 07:11:13.423414 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:11:13.423426 543705 net.go:698] Add success.
I0320 07:11:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:11:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:11:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 07:11:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:11:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 07:11:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:11:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:11:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:11:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:11:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:11:16.472376 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:11:23.169669 543705 disk_info.go:125] begin check local disk info of client
I0320 07:11:23.172105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:11:23.172111 543705 disk_info.go:196] parse disk info done, disk is : [0xc000461cc0 0xc000461d00]
E0320 07:11:23.408442 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:23.408457 543705 memory.go:184] no items to output this cycle
I0320 07:11:23.408470 543705 cpu.go:275] no items to output this cycle
E0320 07:11:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:33.409800 543705 memory.go:184] no items to output this cycle
I0320 07:11:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 07:11:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:43.409791 543705 memory.go:191] Add success.
I0320 07:11:43.409796 543705 cpu.go:282] Add success.
I0320 07:11:43.419886 543705 net.go:648] Add success.
I0320 07:11:43.422969 543705 net.go:770] primary dev: ETH0
I0320 07:11:43.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:11:43.422995 543705 net.go:698] Add success.
I0320 07:11:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:11:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:11:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:11:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:11:53.409759 543705 memory.go:184] no items to output this cycle
I0320 07:11:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:12:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:03.409784 543705 cpu.go:275] no items to output this cycle
I0320 07:12:03.409787 543705 memory.go:184] no items to output this cycle
E0320 07:12:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:13.409826 543705 memory.go:191] Add success.
I0320 07:12:13.409829 543705 cpu.go:282] Add success.
W0320 07:12:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:12:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:12:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:12:13.420162 543705 net.go:648] Add success.
I0320 07:12:13.423220 543705 net.go:770] primary dev: ETH0
I0320 07:12:13.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:12:13.423245 543705 net.go:698] Add success.
I0320 07:12:13.899242 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"016e1c3b-11a4-44c9-9fc8-f3f8f5c148c7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:12:13.899277 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 07:12:14.454842 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:12:14.454912 543705 disk_worker.go:708] disk space is not compliant
W0320 07:12:14.454916 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:12:14.455663 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:12:14.455673 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:12:14.455678 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:12:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 07:12:14.456519 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:12:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:12:15.456852 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:12:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:12:16.457926 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:12:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:12:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:12:16.472321 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:12:23.173676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:12:23.176053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:12:23.176059 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002780c0 0xc000278100]
E0320 07:12:23.408368 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:23.408383 543705 memory.go:184] no items to output this cycle
I0320 07:12:23.408397 543705 cpu.go:275] no items to output this cycle
E0320 07:12:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:33.409770 543705 memory.go:184] no items to output this cycle
I0320 07:12:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 07:12:38.229747 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:12:38.229753 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:12:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:43.410676 543705 memory.go:191] Add success.
I0320 07:12:43.409809 543705 cpu.go:282] Add success.
I0320 07:12:43.420373 543705 net.go:648] Add success.
I0320 07:12:43.422955 543705 net.go:770] primary dev: ETH0
I0320 07:12:43.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:12:43.422979 543705 net.go:698] Add success.
I0320 07:12:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:12:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:12:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:12:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:12:53.409796 543705 memory.go:184] no items to output this cycle
I0320 07:12:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 07:13:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:03.409806 543705 memory.go:184] no items to output this cycle
I0320 07:13:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 07:13:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:13.409822 543705 memory.go:191] Add success.
I0320 07:13:13.409831 543705 cpu.go:282] Add success.
W0320 07:13:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:13:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:13:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:13:13.420143 543705 net.go:648] Add success.
I0320 07:13:13.422778 543705 net.go:770] primary dev: ETH0
I0320 07:13:13.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:13:13.422809 543705 net.go:698] Add success.
I0320 07:13:14.454222 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:13:14.454422 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:13:14.454432 543705 disk_worker.go:708] disk space is not compliant
W0320 07:13:14.454435 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:13:14.455829 543705 disk_worker.go:494] system disk:vda1
I0320 07:13:14.455859 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:13:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:13:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:13:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:13:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:13:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:13:23.177677 543705 disk_info.go:125] begin check local disk info of client
I0320 07:13:23.180169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:13:23.180177 543705 disk_info.go:196] parse disk info done, disk is : [0xc000461480 0xc0004614c0]
E0320 07:13:23.408435 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:23.408447 543705 memory.go:184] no items to output this cycle
I0320 07:13:23.408472 543705 cpu.go:275] no items to output this cycle
E0320 07:13:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:33.409812 543705 memory.go:184] no items to output this cycle
I0320 07:13:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 07:13:43.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:43.410008 543705 cpu.go:282] Add success.
I0320 07:13:43.410008 543705 memory.go:191] Add success.
I0320 07:13:43.419720 543705 net.go:648] Add success.
I0320 07:13:43.422564 543705 net.go:770] primary dev: ETH0
I0320 07:13:43.422579 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:13:43.422593 543705 net.go:698] Add success.
I0320 07:13:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:13:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:13:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:13:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:13:53.409792 543705 memory.go:184] no items to output this cycle
I0320 07:13:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 07:14:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:03.409781 543705 memory.go:184] no items to output this cycle
I0320 07:14:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 07:14:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:13.409829 543705 memory.go:191] Add success.
I0320 07:14:13.409834 543705 cpu.go:282] Add success.
W0320 07:14:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:14:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:14:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:14:13.420109 543705 net.go:648] Add success.
I0320 07:14:13.422989 543705 net.go:770] primary dev: ETH0
I0320 07:14:13.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:14:13.423015 543705 net.go:698] Add success.
I0320 07:14:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:14:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:14:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 07:14:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:14:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 07:14:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:14:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:14:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:14:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:14:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:14:16.472398 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:14:23.181672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:14:23.184112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:14:23.184118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa7c0 0xc0001fa800]
E0320 07:14:23.407501 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:23.407514 543705 memory.go:184] no items to output this cycle
I0320 07:14:23.407542 543705 cpu.go:275] no items to output this cycle
E0320 07:14:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:33.409787 543705 memory.go:184] no items to output this cycle
I0320 07:14:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 07:14:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:43.409820 543705 memory.go:191] Add success.
I0320 07:14:43.409828 543705 cpu.go:282] Add success.
I0320 07:14:43.420164 543705 net.go:648] Add success.
I0320 07:14:43.423319 543705 net.go:770] primary dev: ETH0
I0320 07:14:43.423333 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:14:43.423344 543705 net.go:698] Add success.
I0320 07:14:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:14:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:14:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:14:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:14:53.409793 543705 memory.go:184] no items to output this cycle
I0320 07:14:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 07:15:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:03.409788 543705 memory.go:184] no items to output this cycle
I0320 07:15:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 07:15:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:13.409815 543705 memory.go:191] Add success.
I0320 07:15:13.409818 543705 cpu.go:282] Add success.
W0320 07:15:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:15:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:15:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:15:13.420105 543705 net.go:648] Add success.
I0320 07:15:13.422872 543705 net.go:770] primary dev: ETH0
I0320 07:15:13.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:15:13.422901 543705 net.go:698] Add success.
I0320 07:15:13.473819 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"245b1b86-fac8-4817-91f4-f5e56db132f7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:15:13.473852 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:15:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:15:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:15:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 07:15:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:15:14.456535 543705 disk_worker.go:494] system disk:vda1
I0320 07:15:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:15:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:15:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:15:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:15:16.472403 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:15:23.185674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:15:23.188122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:15:23.188129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003760c0 0xc000376100]
E0320 07:15:23.408379 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:23.408396 543705 memory.go:184] no items to output this cycle
I0320 07:15:23.408409 543705 cpu.go:275] no items to output this cycle
E0320 07:15:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:33.409815 543705 memory.go:184] no items to output this cycle
I0320 07:15:33.409829 543705 cpu.go:275] no items to output this cycle
I0320 07:15:38.233738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:15:38.233744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:15:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:43.410853 543705 memory.go:191] Add success.
I0320 07:15:43.409916 543705 cpu.go:282] Add success.
I0320 07:15:43.419762 543705 net.go:648] Add success.
I0320 07:15:43.422991 543705 net.go:770] primary dev: ETH0
I0320 07:15:43.423006 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:15:43.423019 543705 net.go:698] Add success.
I0320 07:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:15:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:15:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:15:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:15:53.409767 543705 memory.go:184] no items to output this cycle
I0320 07:15:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:16:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:03.409776 543705 memory.go:184] no items to output this cycle
I0320 07:16:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 07:16:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:13.409836 543705 memory.go:191] Add success.
I0320 07:16:13.409839 543705 cpu.go:282] Add success.
W0320 07:16:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:16:13.409886 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:16:13.409890 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:16:13.420121 543705 net.go:648] Add success.
I0320 07:16:13.423000 543705 net.go:770] primary dev: ETH0
I0320 07:16:13.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:16:13.423026 543705 net.go:698] Add success.
I0320 07:16:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:16:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:16:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 07:16:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:16:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 07:16:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:16:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:16:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:16:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:16:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:16:16.472443 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:16:23.189677 543705 disk_info.go:125] begin check local disk info of client
I0320 07:16:23.192121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:16:23.192127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fc080 0xc0001fc0c0]
E0320 07:16:23.408349 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:23.408364 543705 memory.go:184] no items to output this cycle
I0320 07:16:23.408382 543705 cpu.go:275] no items to output this cycle
E0320 07:16:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:33.409785 543705 memory.go:184] no items to output this cycle
I0320 07:16:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 07:16:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:43.409800 543705 memory.go:191] Add success.
I0320 07:16:43.409817 543705 cpu.go:282] Add success.
I0320 07:16:43.419887 543705 net.go:648] Add success.
I0320 07:16:43.422775 543705 net.go:770] primary dev: ETH0
I0320 07:16:43.422788 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:16:43.422800 543705 net.go:698] Add success.
I0320 07:16:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:16:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:16:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:16:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:16:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 07:16:53.409786 543705 memory.go:184] no items to output this cycle
E0320 07:17:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:03.409805 543705 memory.go:184] no items to output this cycle
I0320 07:17:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 07:17:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:13.409787 543705 memory.go:191] Add success.
I0320 07:17:13.409806 543705 cpu.go:282] Add success.
W0320 07:17:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:17:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:17:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:17:13.420143 543705 net.go:648] Add success.
I0320 07:17:13.423081 543705 net.go:770] primary dev: ETH0
I0320 07:17:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:17:13.423107 543705 net.go:698] Add success.
I0320 07:17:13.453664 543705 event_worker.go:152] Polling the log file for events...
W0320 07:17:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:17:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 07:17:14.455162 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:17:14.456941 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:17:14.456950 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:17:14.456956 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:17:14.457012 543705 disk_worker.go:494] system disk:vda1
I0320 07:17:14.457044 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:17:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:17:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:17:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:17:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:17:16.457980 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:17:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:17:16.472311 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:17:23.193671 543705 disk_info.go:125] begin check local disk info of client
I0320 07:17:23.196090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:17:23.196095 543705 disk_info.go:196] parse disk info done, disk is : [0xc000368380 0xc0003683c0]
E0320 07:17:23.408297 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:23.408309 543705 memory.go:184] no items to output this cycle
I0320 07:17:23.408318 543705 cpu.go:275] no items to output this cycle
E0320 07:17:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:33.409790 543705 memory.go:184] no items to output this cycle
I0320 07:17:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:17:43.409851 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:43.409886 543705 memory.go:191] Add success.
I0320 07:17:43.409963 543705 cpu.go:282] Add success.
I0320 07:17:43.419717 543705 net.go:648] Add success.
I0320 07:17:43.422836 543705 net.go:770] primary dev: ETH0
I0320 07:17:43.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:17:43.422861 543705 net.go:698] Add success.
I0320 07:17:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:17:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:17:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:17:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:17:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 07:17:53.409785 543705 memory.go:184] no items to output this cycle
E0320 07:18:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:03.409787 543705 memory.go:184] no items to output this cycle
I0320 07:18:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 07:18:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:13.409829 543705 memory.go:191] Add success.
I0320 07:18:13.409835 543705 cpu.go:282] Add success.
W0320 07:18:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:18:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:18:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:18:13.420171 543705 net.go:648] Add success.
I0320 07:18:13.423058 543705 net.go:770] primary dev: ETH0
I0320 07:18:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:18:13.423084 543705 net.go:698] Add success.
I0320 07:18:13.468628 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8b4b5ce4-2c4c-4680-951c-df05dc8eaf40","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:18:13.468660 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:18:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:18:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:18:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 07:18:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:18:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 07:18:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:18:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:18:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:18:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:18:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:18:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:18:23.197674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:18:23.200186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:18:23.200192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5980 0xc0000c59c0]
E0320 07:18:23.408366 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:23.408378 543705 memory.go:184] no items to output this cycle
I0320 07:18:23.408401 543705 cpu.go:275] no items to output this cycle
E0320 07:18:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:33.409805 543705 memory.go:184] no items to output this cycle
I0320 07:18:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 07:18:38.237737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:18:38.237744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:18:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:43.410878 543705 memory.go:191] Add success.
I0320 07:18:43.409819 543705 cpu.go:282] Add success.
I0320 07:18:43.420980 543705 net.go:648] Add success.
I0320 07:18:43.424180 543705 net.go:770] primary dev: ETH0
I0320 07:18:43.424192 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:18:43.424204 543705 net.go:698] Add success.
I0320 07:18:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:18:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:18:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:18:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:18:53.409779 543705 memory.go:184] no items to output this cycle
I0320 07:18:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 07:19:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:03.409773 543705 memory.go:184] no items to output this cycle
I0320 07:19:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 07:19:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:13.409791 543705 memory.go:191] Add success.
I0320 07:19:13.409792 543705 cpu.go:282] Add success.
W0320 07:19:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:19:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:19:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:19:13.420171 543705 net.go:648] Add success.
I0320 07:19:13.422954 543705 net.go:770] primary dev: ETH0
I0320 07:19:13.422968 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:19:13.422979 543705 net.go:698] Add success.
I0320 07:19:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:19:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:19:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 07:19:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:19:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 07:19:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:19:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:19:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:19:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:19:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:19:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:19:23.201676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:19:23.204089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:19:23.204095 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c4040 0xc0001c4080]
E0320 07:19:23.408259 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:23.408282 543705 memory.go:184] no items to output this cycle
I0320 07:19:23.408294 543705 cpu.go:275] no items to output this cycle
E0320 07:19:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:33.409777 543705 memory.go:184] no items to output this cycle
I0320 07:19:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:19:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:43.409774 543705 memory.go:191] Add success.
I0320 07:19:43.409808 543705 cpu.go:282] Add success.
I0320 07:19:43.420191 543705 net.go:648] Add success.
I0320 07:19:43.423339 543705 net.go:770] primary dev: ETH0
I0320 07:19:43.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:19:43.423364 543705 net.go:698] Add success.
I0320 07:19:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:19:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:19:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:19:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:19:53.409799 543705 memory.go:184] no items to output this cycle
I0320 07:19:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 07:20:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:03.409785 543705 memory.go:184] no items to output this cycle
I0320 07:20:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:20:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:13.409823 543705 memory.go:191] Add success.
I0320 07:20:13.409844 543705 cpu.go:282] Add success.
W0320 07:20:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:20:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:20:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:20:13.420184 543705 net.go:648] Add success.
I0320 07:20:13.422999 543705 net.go:770] primary dev: ETH0
I0320 07:20:13.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:20:13.423024 543705 net.go:698] Add success.
I0320 07:20:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:20:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:20:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 07:20:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:20:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 07:20:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:20:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:20:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:20:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:20:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:20:23.205676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:20:23.208081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:20:23.208087 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa880 0xc0001aa8c0]
E0320 07:20:23.407524 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:23.407525 543705 cpu.go:275] no items to output this cycle
I0320 07:20:23.407538 543705 memory.go:184] no items to output this cycle
E0320 07:20:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:33.409782 543705 memory.go:184] no items to output this cycle
I0320 07:20:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:20:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:43.409792 543705 memory.go:191] Add success.
I0320 07:20:43.409812 543705 cpu.go:282] Add success.
I0320 07:20:43.420102 543705 net.go:648] Add success.
I0320 07:20:43.423307 543705 net.go:770] primary dev: ETH0
I0320 07:20:43.423320 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:20:43.423332 543705 net.go:698] Add success.
I0320 07:20:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:20:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:20:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:20:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:20:53.409782 543705 memory.go:184] no items to output this cycle
I0320 07:20:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 07:21:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:03.409793 543705 memory.go:184] no items to output this cycle
I0320 07:21:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:21:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:13.409788 543705 memory.go:191] Add success.
I0320 07:21:13.409807 543705 cpu.go:282] Add success.
W0320 07:21:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:21:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:21:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:21:13.420163 543705 net.go:648] Add success.
I0320 07:21:13.422839 543705 net.go:770] primary dev: ETH0
I0320 07:21:13.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:21:13.422863 543705 net.go:698] Add success.
I0320 07:21:13.559584 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a28403bb-719b-4905-9e66-6e27d204b7f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:21:13.559617 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:21:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:21:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:21:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 07:21:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:21:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 07:21:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:21:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:21:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:21:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:21:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:21:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:21:23.209683 543705 disk_info.go:125] begin check local disk info of client
I0320 07:21:23.212092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:21:23.212099 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb940 0xc0001fb980]
E0320 07:21:23.408220 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:23.408237 543705 memory.go:184] no items to output this cycle
I0320 07:21:23.408253 543705 cpu.go:275] no items to output this cycle
E0320 07:21:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:33.409786 543705 memory.go:184] no items to output this cycle
I0320 07:21:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 07:21:38.241734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:21:38.241741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:21:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:43.410653 543705 memory.go:191] Add success.
I0320 07:21:43.409803 543705 cpu.go:282] Add success.
I0320 07:21:43.420360 543705 net.go:648] Add success.
I0320 07:21:43.423235 543705 net.go:770] primary dev: ETH0
I0320 07:21:43.423272 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:21:43.423289 543705 net.go:698] Add success.
I0320 07:21:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:21:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:21:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:21:53.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:21:53.409834 543705 memory.go:184] no items to output this cycle
I0320 07:21:53.409847 543705 cpu.go:275] no items to output this cycle
E0320 07:22:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:03.409768 543705 memory.go:184] no items to output this cycle
I0320 07:22:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:22:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:13.409790 543705 memory.go:191] Add success.
I0320 07:22:13.409806 543705 cpu.go:282] Add success.
W0320 07:22:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:22:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:22:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:22:13.420252 543705 net.go:648] Add success.
I0320 07:22:13.422921 543705 net.go:770] primary dev: ETH0
I0320 07:22:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:22:13.422950 543705 net.go:698] Add success.
W0320 07:22:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:22:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 07:22:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:22:14.456918 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:22:14.456927 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:22:14.456933 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:22:14.456990 543705 disk_worker.go:494] system disk:vda1
I0320 07:22:14.457019 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:22:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:22:15.456816 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:22:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:22:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:22:16.457991 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:22:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:22:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:22:23.213679 543705 disk_info.go:125] begin check local disk info of client
I0320 07:22:23.216066 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:22:23.216073 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a1f00 0xc0004c3d00]
E0320 07:22:23.408192 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:23.408208 543705 memory.go:184] no items to output this cycle
I0320 07:22:23.408223 543705 cpu.go:275] no items to output this cycle
E0320 07:22:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:33.409805 543705 memory.go:184] no items to output this cycle
I0320 07:22:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 07:22:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:43.409781 543705 memory.go:191] Add success.
I0320 07:22:43.409804 543705 cpu.go:282] Add success.
I0320 07:22:43.419866 543705 net.go:648] Add success.
I0320 07:22:43.422831 543705 net.go:770] primary dev: ETH0
I0320 07:22:43.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:22:43.422857 543705 net.go:698] Add success.
I0320 07:22:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:22:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:22:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:22:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:22:53.409784 543705 memory.go:184] no items to output this cycle
I0320 07:22:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 07:23:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:03.409783 543705 memory.go:184] no items to output this cycle
I0320 07:23:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:23:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:13.409818 543705 memory.go:191] Add success.
I0320 07:23:13.409824 543705 cpu.go:282] Add success.
W0320 07:23:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:23:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:23:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:23:13.420158 543705 net.go:648] Add success.
I0320 07:23:13.422768 543705 net.go:770] primary dev: ETH0
I0320 07:23:13.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:23:13.422793 543705 net.go:698] Add success.
I0320 07:23:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:23:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:23:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 07:23:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:23:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 07:23:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:23:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:23:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:23:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:23:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:23:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:23:23.217678 543705 disk_info.go:125] begin check local disk info of client
I0320 07:23:23.220049 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:23:23.220055 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4300 0xc0000c4340]
E0320 07:23:23.407526 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:23.407543 543705 memory.go:184] no items to output this cycle
I0320 07:23:23.407567 543705 cpu.go:275] no items to output this cycle
E0320 07:23:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:33.409781 543705 memory.go:184] no items to output this cycle
I0320 07:23:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 07:23:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:43.409809 543705 memory.go:191] Add success.
I0320 07:23:43.409819 543705 cpu.go:282] Add success.
I0320 07:23:43.419892 543705 net.go:648] Add success.
I0320 07:23:43.422760 543705 net.go:770] primary dev: ETH0
I0320 07:23:43.422775 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:23:43.422790 543705 net.go:698] Add success.
I0320 07:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:23:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:23:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:23:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:23:53.409802 543705 memory.go:184] no items to output this cycle
I0320 07:23:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 07:24:03.409887 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:03.409906 543705 cpu.go:275] no items to output this cycle
I0320 07:24:03.409951 543705 memory.go:184] no items to output this cycle
E0320 07:24:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:13.409796 543705 memory.go:191] Add success.
I0320 07:24:13.409816 543705 cpu.go:282] Add success.
W0320 07:24:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:24:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:24:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:24:13.420121 543705 net.go:648] Add success.
I0320 07:24:13.423100 543705 net.go:770] primary dev: ETH0
I0320 07:24:13.423114 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:24:13.423125 543705 net.go:698] Add success.
I0320 07:24:13.595750 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0080df41-9d71-44c3-8b4a-0796f7cadf7c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:24:13.595782 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:24:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:24:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:24:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 07:24:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:24:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 07:24:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:24:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:24:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:24:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:24:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:24:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:24:23.221674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:24:23.224094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:24:23.224100 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037fe00 0xc00037fe40]
E0320 07:24:23.407508 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:23.407522 543705 memory.go:184] no items to output this cycle
I0320 07:24:23.407536 543705 cpu.go:275] no items to output this cycle
E0320 07:24:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:33.409780 543705 memory.go:184] no items to output this cycle
I0320 07:24:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 07:24:38.245733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:24:38.245739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:24:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:43.410608 543705 memory.go:191] Add success.
I0320 07:24:43.409825 543705 cpu.go:282] Add success.
I0320 07:24:43.420438 543705 net.go:648] Add success.
I0320 07:24:43.422986 543705 net.go:770] primary dev: ETH0
I0320 07:24:43.423000 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:24:43.423012 543705 net.go:698] Add success.
I0320 07:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:24:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:24:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:24:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:24:53.409788 543705 memory.go:184] no items to output this cycle
I0320 07:24:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 07:25:03.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:03.409904 543705 memory.go:184] no items to output this cycle
I0320 07:25:03.409932 543705 cpu.go:275] no items to output this cycle
E0320 07:25:13.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:13.409839 543705 memory.go:191] Add success.
I0320 07:25:13.409846 543705 cpu.go:282] Add success.
W0320 07:25:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:25:13.409887 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:25:13.409891 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:25:13.420330 543705 net.go:648] Add success.
I0320 07:25:13.423174 543705 net.go:770] primary dev: ETH0
I0320 07:25:13.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:25:13.423200 543705 net.go:698] Add success.
I0320 07:25:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:25:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:25:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 07:25:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:25:14.456502 543705 disk_worker.go:494] system disk:vda1
I0320 07:25:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:25:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:25:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:25:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:25:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:25:16.472364 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:25:23.225674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:25:23.228156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:25:23.228162 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4a80 0xc0000c4ac0]
E0320 07:25:23.408203 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:23.408217 543705 memory.go:184] no items to output this cycle
I0320 07:25:23.408222 543705 cpu.go:275] no items to output this cycle
E0320 07:25:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:33.409815 543705 memory.go:184] no items to output this cycle
I0320 07:25:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 07:25:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:43.409794 543705 memory.go:191] Add success.
I0320 07:25:43.409820 543705 cpu.go:282] Add success.
I0320 07:25:43.419898 543705 net.go:648] Add success.
I0320 07:25:43.422723 543705 net.go:770] primary dev: ETH0
I0320 07:25:43.422737 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:25:43.422750 543705 net.go:698] Add success.
I0320 07:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:25:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:25:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:25:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:25:53.409806 543705 memory.go:184] no items to output this cycle
I0320 07:25:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:26:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:03.409785 543705 memory.go:184] no items to output this cycle
I0320 07:26:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 07:26:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:13.409815 543705 memory.go:191] Add success.
I0320 07:26:13.409820 543705 cpu.go:282] Add success.
W0320 07:26:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:26:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:26:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:26:13.420145 543705 net.go:648] Add success.
I0320 07:26:13.423085 543705 net.go:770] primary dev: ETH0
I0320 07:26:13.423098 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:26:13.423110 543705 net.go:698] Add success.
I0320 07:26:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:26:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:26:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 07:26:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:26:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 07:26:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:26:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:26:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:26:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:26:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:26:23.229675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:26:23.232111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:26:23.232118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbf40 0xc00034e000]
E0320 07:26:23.408139 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:23.408154 543705 memory.go:184] no items to output this cycle
I0320 07:26:23.408168 543705 cpu.go:275] no items to output this cycle
E0320 07:26:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:33.409794 543705 memory.go:184] no items to output this cycle
I0320 07:26:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 07:26:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:43.409804 543705 memory.go:191] Add success.
I0320 07:26:43.409808 543705 cpu.go:282] Add success.
I0320 07:26:43.419913 543705 net.go:648] Add success.
I0320 07:26:43.422471 543705 net.go:770] primary dev: ETH0
I0320 07:26:43.422486 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:26:43.422501 543705 net.go:698] Add success.
I0320 07:26:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:26:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:26:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:26:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:26:53.409782 543705 memory.go:184] no items to output this cycle
I0320 07:26:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 07:27:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:03.409762 543705 memory.go:184] no items to output this cycle
I0320 07:27:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 07:27:13.409919 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:13.409933 543705 cpu.go:282] Add success.
I0320 07:27:13.410104 543705 memory.go:191] Add success.
W0320 07:27:13.410142 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:27:13.410159 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:27:13.410164 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:27:13.419716 543705 net.go:648] Add success.
I0320 07:27:13.422844 543705 net.go:770] primary dev: ETH0
I0320 07:27:13.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:27:13.422868 543705 net.go:698] Add success.
I0320 07:27:13.429233 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 07:27:13.453474 543705 event_worker.go:152] Polling the log file for events...
I0320 07:27:13.501864 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3ad1acc5-248e-4b0b-85da-9d79f45b098d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:27:13.501897 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 07:27:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:27:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 07:27:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:27:14.455836 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:27:14.455845 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:27:14.455850 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:27:14.456649 543705 disk_worker.go:494] system disk:vda1
I0320 07:27:14.456677 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:27:15.456842 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:27:15.456850 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:27:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:27:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:27:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:27:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:27:16.472336 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:27:23.233674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:27:23.236062 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:27:23.236068 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b3c40 0xc0002b3c80]
E0320 07:27:23.407508 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:23.407520 543705 memory.go:184] no items to output this cycle
I0320 07:27:23.407552 543705 cpu.go:275] no items to output this cycle
E0320 07:27:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:33.409780 543705 memory.go:184] no items to output this cycle
I0320 07:27:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 07:27:38.249737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:27:38.249743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:27:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:43.410611 543705 memory.go:191] Add success.
I0320 07:27:43.409813 543705 cpu.go:282] Add success.
I0320 07:27:43.420383 543705 net.go:648] Add success.
I0320 07:27:43.423025 543705 net.go:770] primary dev: ETH0
I0320 07:27:43.423040 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:27:43.423053 543705 net.go:698] Add success.
I0320 07:27:46.458117 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:27:46.458184 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:27:46.458212 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:27:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:27:53.409776 543705 memory.go:184] no items to output this cycle
I0320 07:27:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:28:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:03.409765 543705 memory.go:184] no items to output this cycle
I0320 07:28:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 07:28:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:13.409834 543705 memory.go:191] Add success.
I0320 07:28:13.409846 543705 cpu.go:282] Add success.
W0320 07:28:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:28:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:28:13.409886 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:28:13.420165 543705 net.go:648] Add success.
I0320 07:28:13.422771 543705 net.go:770] primary dev: ETH0
I0320 07:28:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:28:13.422795 543705 net.go:698] Add success.
I0320 07:28:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:28:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:28:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 07:28:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:28:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 07:28:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:28:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:28:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:28:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:28:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:28:16.472440 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:28:23.237672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:28:23.240227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:28:23.240234 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057db80 0xc00057dbc0]
E0320 07:28:23.408189 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:23.408201 543705 memory.go:184] no items to output this cycle
I0320 07:28:23.408240 543705 cpu.go:275] no items to output this cycle
E0320 07:28:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 07:28:33.409790 543705 memory.go:184] no items to output this cycle
E0320 07:28:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:43.409795 543705 memory.go:191] Add success.
I0320 07:28:43.409799 543705 cpu.go:282] Add success.
I0320 07:28:43.419952 543705 net.go:648] Add success.
I0320 07:28:43.422888 543705 net.go:770] primary dev: ETH0
I0320 07:28:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:28:43.422917 543705 net.go:698] Add success.
I0320 07:28:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:28:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:28:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:28:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:28:53.409770 543705 memory.go:184] no items to output this cycle
I0320 07:28:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 07:29:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:03.409775 543705 memory.go:184] no items to output this cycle
I0320 07:29:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 07:29:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:13.409827 543705 memory.go:191] Add success.
I0320 07:29:13.409833 543705 cpu.go:282] Add success.
W0320 07:29:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:29:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:29:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:29:13.419977 543705 net.go:770] primary dev: ETH0
I0320 07:29:13.419991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:29:13.420004 543705 net.go:698] Add success.
I0320 07:29:13.420365 543705 net.go:648] Add success.
I0320 07:29:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:29:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:29:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 07:29:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:29:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 07:29:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:29:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:29:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:29:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:29:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:29:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:29:23.241674 543705 disk_info.go:125] begin check local disk info of client
I0320 07:29:23.244157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:29:23.244163 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c60c0 0xc0001c6140]
E0320 07:29:23.407508 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:23.407522 543705 memory.go:184] no items to output this cycle
I0320 07:29:23.407525 543705 cpu.go:275] no items to output this cycle
E0320 07:29:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:33.409808 543705 memory.go:184] no items to output this cycle
I0320 07:29:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 07:29:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:43.409775 543705 memory.go:191] Add success.
I0320 07:29:43.409802 543705 cpu.go:282] Add success.
I0320 07:29:43.420048 543705 net.go:648] Add success.
I0320 07:29:43.422898 543705 net.go:770] primary dev: ETH0
I0320 07:29:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:29:43.422929 543705 net.go:698] Add success.
I0320 07:29:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:29:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:29:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:29:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:29:53.409780 543705 memory.go:184] no items to output this cycle
I0320 07:29:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 07:30:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:03.409773 543705 memory.go:184] no items to output this cycle
I0320 07:30:03.409805 543705 cpu.go:275] no items to output this cycle
I0320 07:30:13.409967 543705 cpu.go:282] Add success.
E0320 07:30:13.409915 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:13.410043 543705 memory.go:191] Add success.
W0320 07:30:13.410073 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:30:13.410085 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:30:13.410088 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:30:13.419734 543705 net.go:648] Add success.
I0320 07:30:13.422578 543705 net.go:770] primary dev: ETH0
I0320 07:30:13.422592 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:30:13.422606 543705 net.go:698] Add success.
I0320 07:30:13.468087 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"66c00345-7d33-4122-987c-8e0fa3c29def","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:30:13.468117 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:30:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:30:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:30:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 07:30:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:30:14.456628 543705 disk_worker.go:494] system disk:vda1
I0320 07:30:14.456657 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:30:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:30:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:30:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:30:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:30:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:30:23.246330 543705 disk_info.go:125] begin check local disk info of client
I0320 07:30:23.248749 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:30:23.248756 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b740 0xc00007b780]
E0320 07:30:23.407505 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:23.407518 543705 memory.go:184] no items to output this cycle
I0320 07:30:23.407529 543705 cpu.go:275] no items to output this cycle
E0320 07:30:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:33.409805 543705 memory.go:184] no items to output this cycle
I0320 07:30:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 07:30:38.253738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:30:38.253745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:30:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:43.410736 543705 memory.go:191] Add success.
I0320 07:30:43.409903 543705 cpu.go:282] Add success.
I0320 07:30:43.420474 543705 net.go:648] Add success.
I0320 07:30:43.423010 543705 net.go:770] primary dev: ETH0
I0320 07:30:43.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:30:43.423037 543705 net.go:698] Add success.
I0320 07:30:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:30:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:30:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:30:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:30:53.409763 543705 memory.go:184] no items to output this cycle
I0320 07:30:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 07:31:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:03.409766 543705 memory.go:184] no items to output this cycle
I0320 07:31:03.409807 543705 cpu.go:275] no items to output this cycle
W0320 07:31:13.409712 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:31:13.409731 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:31:13.409737 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:31:13.409899 543705 cpu.go:282] Add success.
E0320 07:31:13.410039 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:13.410068 543705 memory.go:191] Add success.
I0320 07:31:13.419758 543705 net.go:648] Add success.
I0320 07:31:13.422665 543705 net.go:770] primary dev: ETH0
I0320 07:31:13.422678 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:31:13.422690 543705 net.go:698] Add success.
I0320 07:31:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:31:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:31:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 07:31:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:31:14.456613 543705 disk_worker.go:494] system disk:vda1
I0320 07:31:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:31:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:31:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:31:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:31:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:31:16.472372 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:31:23.249673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:31:23.252111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:31:23.252117 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b4c0 0xc00007b500]
E0320 07:31:23.407522 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:23.407538 543705 memory.go:184] no items to output this cycle
I0320 07:31:23.407551 543705 cpu.go:275] no items to output this cycle
E0320 07:31:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:33.409787 543705 memory.go:184] no items to output this cycle
I0320 07:31:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:31:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:43.409818 543705 memory.go:191] Add success.
I0320 07:31:43.409824 543705 cpu.go:282] Add success.
I0320 07:31:43.419979 543705 net.go:648] Add success.
I0320 07:31:43.422755 543705 net.go:770] primary dev: ETH0
I0320 07:31:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:31:43.422781 543705 net.go:698] Add success.
I0320 07:31:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:31:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:31:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:31:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:31:53.409781 543705 memory.go:184] no items to output this cycle
I0320 07:31:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 07:32:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:03.409777 543705 memory.go:184] no items to output this cycle
I0320 07:32:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:32:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:13.409799 543705 memory.go:191] Add success.
I0320 07:32:13.409801 543705 cpu.go:282] Add success.
W0320 07:32:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:32:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:32:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:32:13.420179 543705 net.go:648] Add success.
I0320 07:32:13.423384 543705 net.go:770] primary dev: ETH0
I0320 07:32:13.423397 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:32:13.423416 543705 net.go:698] Add success.
W0320 07:32:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:32:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 07:32:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:32:14.455960 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:32:14.455969 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:32:14.455975 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:32:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 07:32:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:32:15.456795 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:32:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:32:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:32:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:32:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:32:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:32:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:32:23.253673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:32:23.256055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:32:23.256061 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007af80 0xc00007afc0]
E0320 07:32:23.407965 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:23.407981 543705 memory.go:184] no items to output this cycle
I0320 07:32:23.407993 543705 cpu.go:275] no items to output this cycle
E0320 07:32:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:33.409780 543705 memory.go:184] no items to output this cycle
I0320 07:32:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:32:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:43.409796 543705 memory.go:191] Add success.
I0320 07:32:43.409798 543705 cpu.go:282] Add success.
I0320 07:32:43.419883 543705 net.go:648] Add success.
I0320 07:32:43.422920 543705 net.go:770] primary dev: ETH0
I0320 07:32:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:32:43.422949 543705 net.go:698] Add success.
I0320 07:32:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:32:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:32:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:32:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:32:53.409782 543705 memory.go:184] no items to output this cycle
I0320 07:32:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 07:33:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:03.409784 543705 memory.go:184] no items to output this cycle
I0320 07:33:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 07:33:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:13.409806 543705 memory.go:191] Add success.
I0320 07:33:13.409810 543705 cpu.go:282] Add success.
W0320 07:33:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:33:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:33:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:33:13.420385 543705 net.go:648] Add success.
I0320 07:33:13.423173 543705 net.go:770] primary dev: ETH0
I0320 07:33:13.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:33:13.423198 543705 net.go:698] Add success.
I0320 07:33:13.467648 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1162f6c8-b041-4bc5-9732-4f98a383f2e3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:33:13.467680 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:33:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:33:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:33:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 07:33:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:33:14.456686 543705 disk_worker.go:494] system disk:vda1
I0320 07:33:14.456714 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:33:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:33:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:33:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:33:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:33:16.472375 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:33:23.257673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:33:23.260076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:33:23.260083 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035be40 0xc00035be80]
E0320 07:33:23.407969 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:23.407986 543705 memory.go:184] no items to output this cycle
I0320 07:33:23.408000 543705 cpu.go:275] no items to output this cycle
E0320 07:33:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:33.409815 543705 memory.go:184] no items to output this cycle
I0320 07:33:33.409827 543705 cpu.go:275] no items to output this cycle
I0320 07:33:38.257735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:33:38.257741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:33:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:43.410709 543705 memory.go:191] Add success.
I0320 07:33:43.409808 543705 cpu.go:282] Add success.
I0320 07:33:43.420416 543705 net.go:648] Add success.
I0320 07:33:43.423422 543705 net.go:770] primary dev: ETH0
I0320 07:33:43.423436 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:33:43.423450 543705 net.go:698] Add success.
I0320 07:33:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:33:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:33:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:33:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:33:53.409763 543705 memory.go:184] no items to output this cycle
I0320 07:33:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 07:34:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:03.409776 543705 memory.go:184] no items to output this cycle
I0320 07:34:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 07:34:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:13.409785 543705 memory.go:191] Add success.
W0320 07:34:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 07:34:13.409823 543705 cpu.go:282] Add success.
W0320 07:34:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:34:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:34:13.420144 543705 net.go:648] Add success.
I0320 07:34:13.423085 543705 net.go:770] primary dev: ETH0
I0320 07:34:13.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:34:13.423299 543705 net.go:698] Add success.
I0320 07:34:14.454950 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:34:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:34:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 07:34:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:34:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 07:34:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:34:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:34:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:34:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:34:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:34:16.472368 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:34:23.261672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:34:23.264174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:34:23.264181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004af380 0xc0004af3c0]
E0320 07:34:23.408015 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:23.408028 543705 memory.go:184] no items to output this cycle
I0320 07:34:23.408059 543705 cpu.go:275] no items to output this cycle
E0320 07:34:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:33.409800 543705 memory.go:184] no items to output this cycle
I0320 07:34:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 07:34:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:43.409787 543705 memory.go:191] Add success.
I0320 07:34:43.409814 543705 cpu.go:282] Add success.
I0320 07:34:43.419894 543705 net.go:648] Add success.
I0320 07:34:43.422703 543705 net.go:770] primary dev: ETH0
I0320 07:34:43.422717 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:34:43.422731 543705 net.go:698] Add success.
I0320 07:34:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:34:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:34:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:34:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:34:53.409767 543705 memory.go:184] no items to output this cycle
I0320 07:34:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 07:35:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:03.409775 543705 memory.go:184] no items to output this cycle
I0320 07:35:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:35:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:13.409787 543705 memory.go:191] Add success.
W0320 07:35:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:35:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:35:13.409826 543705 cpu.go:282] Add success.
I0320 07:35:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:35:13.420122 543705 net.go:648] Add success.
I0320 07:35:13.422949 543705 net.go:770] primary dev: ETH0
I0320 07:35:13.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:35:13.423126 543705 net.go:698] Add success.
I0320 07:35:14.454862 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:35:14.455070 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:35:14.455081 543705 disk_worker.go:708] disk space is not compliant
W0320 07:35:14.455083 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:35:14.456471 543705 disk_worker.go:494] system disk:vda1
I0320 07:35:14.456499 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:35:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:35:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:35:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:35:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:35:23.265698 543705 disk_info.go:125] begin check local disk info of client
I0320 07:35:23.268159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:35:23.268165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae1c0 0xc0004ae200]
E0320 07:35:23.408014 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:23.408031 543705 memory.go:184] no items to output this cycle
I0320 07:35:23.408043 543705 cpu.go:275] no items to output this cycle
E0320 07:35:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:33.409787 543705 memory.go:184] no items to output this cycle
I0320 07:35:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 07:35:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:43.409789 543705 memory.go:191] Add success.
I0320 07:35:43.409820 543705 cpu.go:282] Add success.
I0320 07:35:43.419866 543705 net.go:648] Add success.
I0320 07:35:43.422567 543705 net.go:770] primary dev: ETH0
I0320 07:35:43.422582 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:35:43.422597 543705 net.go:698] Add success.
I0320 07:35:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:35:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:35:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:35:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:35:53.409790 543705 memory.go:184] no items to output this cycle
I0320 07:35:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 07:36:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:03.409812 543705 memory.go:184] no items to output this cycle
I0320 07:36:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 07:36:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:13.409809 543705 memory.go:191] Add success.
I0320 07:36:13.409812 543705 cpu.go:282] Add success.
W0320 07:36:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:36:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:36:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:36:13.420143 543705 net.go:648] Add success.
I0320 07:36:13.422756 543705 net.go:770] primary dev: ETH0
I0320 07:36:13.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:36:13.422780 543705 net.go:698] Add success.
I0320 07:36:13.504771 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1fcbfe61-1564-4081-bde0-89dcb56c99b8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:36:13.504802 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:36:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:36:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:36:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 07:36:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:36:14.456667 543705 disk_worker.go:494] system disk:vda1
I0320 07:36:14.456706 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:36:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:36:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:36:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:36:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:36:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:36:23.269675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:36:23.272088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:36:23.272094 543705 disk_info.go:196] parse disk info done, disk is : [0xc000538600 0xc000538700]
E0320 07:36:23.407907 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:23.407922 543705 memory.go:184] no items to output this cycle
I0320 07:36:23.407936 543705 cpu.go:275] no items to output this cycle
E0320 07:36:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:33.409797 543705 memory.go:184] no items to output this cycle
I0320 07:36:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 07:36:38.261759 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:36:38.261767 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:36:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:43.410759 543705 memory.go:191] Add success.
I0320 07:36:43.409834 543705 cpu.go:282] Add success.
I0320 07:36:43.420470 543705 net.go:648] Add success.
I0320 07:36:43.423226 543705 net.go:770] primary dev: ETH0
I0320 07:36:43.423241 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:36:43.423256 543705 net.go:698] Add success.
I0320 07:36:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:36:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:36:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:36:53.409815 543705 memory.go:184] no items to output this cycle
I0320 07:36:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 07:37:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:03.409809 543705 memory.go:184] no items to output this cycle
I0320 07:37:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 07:37:13.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:13.409913 543705 memory.go:191] Add success.
I0320 07:37:13.409914 543705 cpu.go:282] Add success.
W0320 07:37:13.409947 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:37:13.409972 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:37:13.409976 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:37:13.419729 543705 net.go:648] Add success.
I0320 07:37:13.422637 543705 net.go:770] primary dev: ETH0
I0320 07:37:13.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:37:13.422662 543705 net.go:698] Add success.
I0320 07:37:13.453279 543705 event_worker.go:152] Polling the log file for events...
W0320 07:37:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:37:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 07:37:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:37:14.456788 543705 disk_worker.go:494] system disk:vda1
I0320 07:37:14.456825 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:37:14.457128 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:37:14.457135 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:37:14.457140 543705 custom_config.go:64] query custom config with name: gpu
E0320 07:37:15.456858 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:37:15.456867 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:37:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:37:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:37:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:37:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:37:16.472321 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:37:23.273675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:37:23.276081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:37:23.276087 543705 disk_info.go:196] parse disk info done, disk is : [0xc000538340 0xc000538380]
E0320 07:37:23.407886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:23.407902 543705 memory.go:184] no items to output this cycle
I0320 07:37:23.407914 543705 cpu.go:275] no items to output this cycle
E0320 07:37:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:33.409822 543705 memory.go:184] no items to output this cycle
I0320 07:37:33.409836 543705 cpu.go:275] no items to output this cycle
E0320 07:37:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:43.409775 543705 memory.go:191] Add success.
I0320 07:37:43.409816 543705 cpu.go:282] Add success.
I0320 07:37:43.419864 543705 net.go:648] Add success.
I0320 07:37:43.422820 543705 net.go:770] primary dev: ETH0
I0320 07:37:43.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:37:43.422847 543705 net.go:698] Add success.
I0320 07:37:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:37:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:37:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:37:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:37:53.409776 543705 memory.go:184] no items to output this cycle
I0320 07:37:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 07:38:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:03.409791 543705 memory.go:184] no items to output this cycle
I0320 07:38:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 07:38:13.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:13.409926 543705 memory.go:191] Add success.
I0320 07:38:13.409928 543705 cpu.go:282] Add success.
W0320 07:38:13.409963 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:38:13.409989 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:38:13.409992 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:38:13.419711 543705 net.go:648] Add success.
I0320 07:38:13.422524 543705 net.go:770] primary dev: ETH0
I0320 07:38:13.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:38:13.422547 543705 net.go:698] Add success.
I0320 07:38:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:38:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:38:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 07:38:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:38:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 07:38:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:38:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:38:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:38:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:38:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:38:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:38:23.277675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:38:23.280158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:38:23.280165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa400 0xc0001aa480]
E0320 07:38:23.407923 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:23.407935 543705 memory.go:184] no items to output this cycle
I0320 07:38:23.407978 543705 cpu.go:275] no items to output this cycle
E0320 07:38:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:33.409797 543705 memory.go:184] no items to output this cycle
I0320 07:38:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:38:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:43.409795 543705 memory.go:191] Add success.
I0320 07:38:43.409796 543705 cpu.go:282] Add success.
I0320 07:38:43.419867 543705 net.go:648] Add success.
I0320 07:38:43.422802 543705 net.go:770] primary dev: ETH0
I0320 07:38:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:38:43.422834 543705 net.go:698] Add success.
I0320 07:38:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:38:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:38:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:38:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:38:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 07:38:53.409780 543705 memory.go:184] no items to output this cycle
E0320 07:39:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:03.409798 543705 memory.go:184] no items to output this cycle
I0320 07:39:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 07:39:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:13.409923 543705 memory.go:191] Add success.
I0320 07:39:13.409949 543705 cpu.go:282] Add success.
W0320 07:39:13.409960 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:39:13.409975 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:39:13.409978 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:39:13.419749 543705 net.go:648] Add success.
I0320 07:39:13.422694 543705 net.go:770] primary dev: ETH0
I0320 07:39:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:39:13.422719 543705 net.go:698] Add success.
I0320 07:39:13.470289 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b0e4fca8-0e01-474d-a3b0-e34a1501364b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:39:13.470327 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:39:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:39:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:39:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 07:39:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:39:14.456672 543705 disk_worker.go:494] system disk:vda1
I0320 07:39:14.456702 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:39:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:39:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:39:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:39:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:39:16.472441 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:39:23.281673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:39:23.284102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:39:23.284108 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e9c80 0xc0003e9cc0]
E0320 07:39:23.407534 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:23.407549 543705 memory.go:184] no items to output this cycle
I0320 07:39:23.407565 543705 cpu.go:275] no items to output this cycle
E0320 07:39:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:33.409787 543705 memory.go:184] no items to output this cycle
I0320 07:39:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 07:39:38.265758 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:39:38.265765 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:39:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:43.410619 543705 memory.go:191] Add success.
I0320 07:39:43.409801 543705 cpu.go:282] Add success.
I0320 07:39:43.420330 543705 net.go:648] Add success.
I0320 07:39:43.422987 543705 net.go:770] primary dev: ETH0
I0320 07:39:43.423001 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:39:43.423017 543705 net.go:698] Add success.
I0320 07:39:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:39:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:39:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:39:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:39:53.409788 543705 memory.go:184] no items to output this cycle
I0320 07:39:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 07:40:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:03.409806 543705 memory.go:184] no items to output this cycle
I0320 07:40:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 07:40:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:13.409784 543705 memory.go:191] Add success.
I0320 07:40:13.409808 543705 cpu.go:282] Add success.
W0320 07:40:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:40:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:40:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:40:13.419712 543705 net.go:648] Add success.
I0320 07:40:13.422931 543705 net.go:770] primary dev: ETH0
I0320 07:40:13.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:40:13.422955 543705 net.go:698] Add success.
I0320 07:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:40:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:40:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 07:40:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:40:14.456867 543705 disk_worker.go:494] system disk:vda1
I0320 07:40:14.456896 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:40:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:40:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:40:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:40:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:40:16.472390 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:40:23.285671 543705 disk_info.go:125] begin check local disk info of client
I0320 07:40:23.288110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:40:23.288116 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f7000 0xc0001f7040]
E0320 07:40:23.407513 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:23.407526 543705 memory.go:184] no items to output this cycle
I0320 07:40:23.407539 543705 cpu.go:275] no items to output this cycle
E0320 07:40:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 07:40:33.409787 543705 memory.go:184] no items to output this cycle
E0320 07:40:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:43.409826 543705 memory.go:191] Add success.
I0320 07:40:43.409829 543705 cpu.go:282] Add success.
I0320 07:40:43.419889 543705 net.go:648] Add success.
I0320 07:40:43.422881 543705 net.go:770] primary dev: ETH0
I0320 07:40:43.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:40:43.422906 543705 net.go:698] Add success.
I0320 07:40:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:40:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:40:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:40:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:40:53.409777 543705 memory.go:184] no items to output this cycle
I0320 07:40:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 07:41:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:03.409768 543705 memory.go:184] no items to output this cycle
I0320 07:41:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 07:41:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:13.409790 543705 memory.go:191] Add success.
W0320 07:41:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 07:41:13.409820 543705 cpu.go:282] Add success.
W0320 07:41:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:41:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:41:13.420078 543705 net.go:648] Add success.
I0320 07:41:13.423189 543705 net.go:770] primary dev: ETH0
I0320 07:41:13.423204 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:41:13.423218 543705 net.go:698] Add success.
I0320 07:41:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:41:14.455315 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:41:14.455401 543705 disk_worker.go:708] disk space is not compliant
W0320 07:41:14.455406 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:41:14.457036 543705 disk_worker.go:494] system disk:vda1
I0320 07:41:14.457077 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:41:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:41:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:41:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:41:16.472422 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:41:23.289672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:41:23.292143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:41:23.292149 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046c280 0xc00046c2c0]
E0320 07:41:23.407513 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:23.407524 543705 memory.go:184] no items to output this cycle
I0320 07:41:23.407529 543705 cpu.go:275] no items to output this cycle
E0320 07:41:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:33.409783 543705 memory.go:184] no items to output this cycle
I0320 07:41:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 07:41:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:43.409814 543705 memory.go:191] Add success.
I0320 07:41:43.409819 543705 cpu.go:282] Add success.
I0320 07:41:43.420049 543705 net.go:648] Add success.
I0320 07:41:43.423149 543705 net.go:770] primary dev: ETH0
I0320 07:41:43.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:41:43.423175 543705 net.go:698] Add success.
I0320 07:41:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:41:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:41:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:41:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:41:53.409776 543705 cpu.go:275] no items to output this cycle
I0320 07:41:53.409787 543705 memory.go:184] no items to output this cycle
E0320 07:42:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:03.409791 543705 memory.go:184] no items to output this cycle
I0320 07:42:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 07:42:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:13.409816 543705 memory.go:191] Add success.
I0320 07:42:13.409824 543705 cpu.go:282] Add success.
W0320 07:42:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:42:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:42:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:42:13.420141 543705 net.go:648] Add success.
I0320 07:42:13.423066 543705 net.go:770] primary dev: ETH0
I0320 07:42:13.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:42:13.423095 543705 net.go:698] Add success.
I0320 07:42:13.481918 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1a5aaf72-ac73-443e-b902-43f1c9dbd413","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:42:13.481951 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 07:42:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:42:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 07:42:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:42:14.456151 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:42:14.456161 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:42:14.456168 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:42:14.457626 543705 disk_worker.go:494] system disk:vda1
I0320 07:42:14.457681 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:42:15.456858 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:42:15.456867 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:42:16.457902 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:42:16.457901 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:42:16.457955 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:42:16.457974 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:42:16.472301 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:42:23.293676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:42:23.296112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:42:23.296119 543705 disk_info.go:196] parse disk info done, disk is : [0xc000250300 0xc000250340]
E0320 07:42:23.407815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:23.407827 543705 memory.go:184] no items to output this cycle
I0320 07:42:23.407829 543705 cpu.go:275] no items to output this cycle
E0320 07:42:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:33.409803 543705 memory.go:184] no items to output this cycle
I0320 07:42:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 07:42:38.269732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:42:38.269739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:42:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:43.410737 543705 memory.go:191] Add success.
I0320 07:42:43.409790 543705 cpu.go:282] Add success.
I0320 07:42:43.420450 543705 net.go:648] Add success.
I0320 07:42:43.423410 543705 net.go:770] primary dev: ETH0
I0320 07:42:43.423424 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:42:43.423439 543705 net.go:698] Add success.
I0320 07:42:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:42:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:42:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:42:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:42:53.409777 543705 memory.go:184] no items to output this cycle
I0320 07:42:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 07:43:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:03.409764 543705 memory.go:184] no items to output this cycle
I0320 07:43:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 07:43:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:13.409797 543705 memory.go:191] Add success.
I0320 07:43:13.409803 543705 cpu.go:282] Add success.
W0320 07:43:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:43:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:43:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:43:13.420056 543705 net.go:648] Add success.
I0320 07:43:13.422665 543705 net.go:770] primary dev: ETH0
I0320 07:43:13.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:43:13.422695 543705 net.go:698] Add success.
I0320 07:43:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:43:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:43:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 07:43:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:43:14.457028 543705 disk_worker.go:494] system disk:vda1
I0320 07:43:14.457066 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:43:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:43:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:43:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:43:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:43:16.472374 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:43:23.297673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:43:23.300102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:43:23.300108 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f3480 0xc0003f34c0]
E0320 07:43:23.407788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:23.407806 543705 memory.go:184] no items to output this cycle
I0320 07:43:23.407818 543705 cpu.go:275] no items to output this cycle
E0320 07:43:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:33.409802 543705 memory.go:184] no items to output this cycle
I0320 07:43:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 07:43:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:43.409791 543705 memory.go:191] Add success.
I0320 07:43:43.409791 543705 cpu.go:282] Add success.
I0320 07:43:43.419880 543705 net.go:648] Add success.
I0320 07:43:43.422574 543705 net.go:770] primary dev: ETH0
I0320 07:43:43.422588 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:43:43.422600 543705 net.go:698] Add success.
I0320 07:43:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:43:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:43:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:43:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:43:53.409794 543705 memory.go:184] no items to output this cycle
I0320 07:43:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 07:44:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:03.409793 543705 memory.go:184] no items to output this cycle
I0320 07:44:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 07:44:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:13.409783 543705 memory.go:191] Add success.
I0320 07:44:13.409810 543705 cpu.go:282] Add success.
W0320 07:44:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:44:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:44:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:44:13.420132 543705 net.go:648] Add success.
I0320 07:44:13.422550 543705 net.go:770] primary dev: ETH0
I0320 07:44:13.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:44:13.422578 543705 net.go:698] Add success.
I0320 07:44:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:44:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:44:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 07:44:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:44:14.456561 543705 disk_worker.go:494] system disk:vda1
I0320 07:44:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:44:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:44:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:44:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:44:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:44:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:44:23.301676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:44:23.304101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:44:23.304107 543705 disk_info.go:196] parse disk info done, disk is : [0xc000576940 0xc000576980]
E0320 07:44:23.407764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:23.407780 543705 memory.go:184] no items to output this cycle
I0320 07:44:23.407794 543705 cpu.go:275] no items to output this cycle
E0320 07:44:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:33.409798 543705 memory.go:184] no items to output this cycle
I0320 07:44:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 07:44:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:43.409818 543705 memory.go:191] Add success.
I0320 07:44:43.409824 543705 cpu.go:282] Add success.
I0320 07:44:43.419968 543705 net.go:648] Add success.
I0320 07:44:43.422495 543705 net.go:770] primary dev: ETH0
I0320 07:44:43.422511 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:44:43.422525 543705 net.go:698] Add success.
I0320 07:44:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:44:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:44:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:44:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:44:53.409777 543705 memory.go:184] no items to output this cycle
I0320 07:44:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 07:45:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:03.409790 543705 memory.go:184] no items to output this cycle
I0320 07:45:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 07:45:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:13.409800 543705 cpu.go:282] Add success.
I0320 07:45:13.409803 543705 memory.go:191] Add success.
W0320 07:45:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:45:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:45:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:45:13.420143 543705 net.go:648] Add success.
I0320 07:45:13.423140 543705 net.go:770] primary dev: ETH0
I0320 07:45:13.423153 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:45:13.423165 543705 net.go:698] Add success.
I0320 07:45:13.573736 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"05171ae8-d8eb-432c-9a3f-bb8d79955521","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:45:13.573776 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:45:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:45:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:45:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 07:45:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:45:14.456732 543705 disk_worker.go:494] system disk:vda1
I0320 07:45:14.456769 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:45:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:45:16.457581 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:45:16.457642 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:45:16.457687 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:45:16.473045 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:45:23.305675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:45:23.308130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:45:23.308136 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053a580 0xc00053a5c0]
E0320 07:45:23.407786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:23.407801 543705 memory.go:184] no items to output this cycle
I0320 07:45:23.407816 543705 cpu.go:275] no items to output this cycle
E0320 07:45:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:33.409785 543705 memory.go:184] no items to output this cycle
I0320 07:45:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 07:45:38.273732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:45:38.273740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:45:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:43.410699 543705 memory.go:191] Add success.
I0320 07:45:43.409819 543705 cpu.go:282] Add success.
I0320 07:45:43.420460 543705 net.go:648] Add success.
I0320 07:45:43.423032 543705 net.go:770] primary dev: ETH0
I0320 07:45:43.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:45:43.423058 543705 net.go:698] Add success.
I0320 07:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:45:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:45:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:45:53.409779 543705 memory.go:184] no items to output this cycle
I0320 07:45:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 07:46:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:03.409799 543705 memory.go:184] no items to output this cycle
I0320 07:46:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 07:46:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:13.409828 543705 memory.go:191] Add success.
I0320 07:46:13.409834 543705 cpu.go:282] Add success.
W0320 07:46:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:46:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:46:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:46:13.420180 543705 net.go:648] Add success.
I0320 07:46:13.422838 543705 net.go:770] primary dev: ETH0
I0320 07:46:13.422853 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:46:13.422867 543705 net.go:698] Add success.
I0320 07:46:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:46:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:46:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 07:46:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:46:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 07:46:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:46:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:46:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:46:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:46:16.472383 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:46:23.309670 543705 disk_info.go:125] begin check local disk info of client
I0320 07:46:23.312096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:46:23.312102 543705 disk_info.go:196] parse disk info done, disk is : [0xc000322580 0xc0003225c0]
E0320 07:46:23.407840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:23.407860 543705 cpu.go:275] no items to output this cycle
I0320 07:46:23.407862 543705 memory.go:184] no items to output this cycle
E0320 07:46:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:33.409789 543705 memory.go:184] no items to output this cycle
I0320 07:46:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:46:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:43.409796 543705 memory.go:191] Add success.
I0320 07:46:43.409796 543705 cpu.go:282] Add success.
I0320 07:46:43.420026 543705 net.go:648] Add success.
I0320 07:46:43.422530 543705 net.go:770] primary dev: ETH0
I0320 07:46:43.422544 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:46:43.422557 543705 net.go:698] Add success.
I0320 07:46:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:46:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:46:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:46:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:46:53.409768 543705 memory.go:184] no items to output this cycle
I0320 07:46:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 07:47:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:03.409780 543705 cpu.go:275] no items to output this cycle
I0320 07:47:03.409786 543705 memory.go:184] no items to output this cycle
E0320 07:47:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:13.409828 543705 memory.go:191] Add success.
I0320 07:47:13.409834 543705 cpu.go:282] Add success.
W0320 07:47:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:47:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:47:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:47:13.420172 543705 net.go:648] Add success.
I0320 07:47:13.422814 543705 net.go:770] primary dev: ETH0
I0320 07:47:13.422829 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:47:13.422843 543705 net.go:698] Add success.
I0320 07:47:13.453518 543705 event_worker.go:152] Polling the log file for events...
W0320 07:47:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:47:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 07:47:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:47:14.456914 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:47:14.456924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:47:14.456929 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:47:14.457000 543705 disk_worker.go:494] system disk:vda1
I0320 07:47:14.457042 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:47:15.456806 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:47:15.456815 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:47:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:47:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:47:16.458013 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:47:16.458032 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:47:16.472367 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:47:23.313680 543705 disk_info.go:125] begin check local disk info of client
I0320 07:47:23.316125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:47:23.316132 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd9c0 0xc0002bda00]
E0320 07:47:23.407681 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:23.407737 543705 cpu.go:275] no items to output this cycle
I0320 07:47:23.407753 543705 memory.go:184] no items to output this cycle
E0320 07:47:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:33.409806 543705 memory.go:184] no items to output this cycle
I0320 07:47:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 07:47:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:43.409787 543705 memory.go:191] Add success.
I0320 07:47:43.409802 543705 cpu.go:282] Add success.
I0320 07:47:43.419889 543705 net.go:648] Add success.
I0320 07:47:43.422762 543705 net.go:770] primary dev: ETH0
I0320 07:47:43.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:47:43.422788 543705 net.go:698] Add success.
I0320 07:47:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:47:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:47:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:47:53.409767 543705 memory.go:184] no items to output this cycle
I0320 07:47:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 07:48:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:03.409802 543705 memory.go:184] no items to output this cycle
I0320 07:48:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 07:48:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:13.409808 543705 cpu.go:282] Add success.
I0320 07:48:13.409824 543705 memory.go:191] Add success.
W0320 07:48:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:48:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:48:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:48:13.420068 543705 net.go:648] Add success.
I0320 07:48:13.422833 543705 net.go:770] primary dev: ETH0
I0320 07:48:13.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:48:13.422858 543705 net.go:698] Add success.
I0320 07:48:13.470253 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d4fc1324-c85d-446b-97ac-49f4b8638485","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:48:13.470288 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:48:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:48:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:48:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 07:48:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:48:14.456695 543705 disk_worker.go:494] system disk:vda1
I0320 07:48:14.456733 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:48:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:48:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:48:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:48:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:48:16.472386 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:48:23.317673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:48:23.320077 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:48:23.320083 543705 disk_info.go:196] parse disk info done, disk is : [0xc000207240 0xc000207280]
E0320 07:48:23.407672 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:23.407693 543705 memory.go:184] no items to output this cycle
I0320 07:48:23.407698 543705 cpu.go:275] no items to output this cycle
E0320 07:48:33.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:33.409919 543705 memory.go:184] no items to output this cycle
I0320 07:48:33.409995 543705 cpu.go:275] no items to output this cycle
I0320 07:48:38.277738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:48:38.277745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:48:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:43.410775 543705 memory.go:191] Add success.
I0320 07:48:43.409808 543705 cpu.go:282] Add success.
I0320 07:48:43.420489 543705 net.go:648] Add success.
I0320 07:48:43.423395 543705 net.go:770] primary dev: ETH0
I0320 07:48:43.423408 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:48:43.423420 543705 net.go:698] Add success.
I0320 07:48:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:48:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:48:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:48:53.409765 543705 memory.go:184] no items to output this cycle
I0320 07:48:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 07:49:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:03.409798 543705 memory.go:184] no items to output this cycle
I0320 07:49:03.409813 543705 cpu.go:275] no items to output this cycle
W0320 07:49:13.409716 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:49:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:49:13.409739 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:49:13.409804 543705 cpu.go:282] Add success.
E0320 07:49:13.409813 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:13.409831 543705 memory.go:191] Add success.
I0320 07:49:13.420160 543705 net.go:648] Add success.
I0320 07:49:13.423141 543705 net.go:770] primary dev: ETH0
I0320 07:49:13.423155 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:49:13.423166 543705 net.go:698] Add success.
I0320 07:49:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:49:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:49:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 07:49:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:49:14.456518 543705 disk_worker.go:494] system disk:vda1
I0320 07:49:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:49:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:49:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:49:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:49:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:49:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:49:23.321675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:49:23.324086 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:49:23.324092 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384b80 0xc000384bc0]
E0320 07:49:23.407660 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:23.407676 543705 memory.go:184] no items to output this cycle
I0320 07:49:23.407686 543705 cpu.go:275] no items to output this cycle
E0320 07:49:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:33.409786 543705 memory.go:184] no items to output this cycle
I0320 07:49:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 07:49:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:43.409787 543705 memory.go:191] Add success.
I0320 07:49:43.409804 543705 cpu.go:282] Add success.
I0320 07:49:43.419953 543705 net.go:648] Add success.
I0320 07:49:43.422761 543705 net.go:770] primary dev: ETH0
I0320 07:49:43.422775 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:49:43.422786 543705 net.go:698] Add success.
I0320 07:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:49:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:49:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:49:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:49:53.409771 543705 memory.go:184] no items to output this cycle
I0320 07:49:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:50:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:03.409812 543705 memory.go:184] no items to output this cycle
I0320 07:50:03.409827 543705 cpu.go:275] no items to output this cycle
W0320 07:50:13.409716 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:50:13.409734 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:50:13.409740 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:50:13.409800 543705 cpu.go:282] Add success.
E0320 07:50:13.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:13.409864 543705 memory.go:191] Add success.
I0320 07:50:13.420137 543705 net.go:648] Add success.
I0320 07:50:13.422661 543705 net.go:770] primary dev: ETH0
I0320 07:50:13.422674 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:50:13.422685 543705 net.go:698] Add success.
I0320 07:50:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:50:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:50:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 07:50:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:50:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 07:50:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:50:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:50:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:50:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:50:16.472381 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:50:23.325676 543705 disk_info.go:125] begin check local disk info of client
I0320 07:50:23.328100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:50:23.328106 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004faf40 0xc0004faf80]
E0320 07:50:23.407656 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:23.407671 543705 memory.go:184] no items to output this cycle
I0320 07:50:23.407685 543705 cpu.go:275] no items to output this cycle
E0320 07:50:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:33.409782 543705 memory.go:184] no items to output this cycle
I0320 07:50:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 07:50:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:43.409788 543705 memory.go:191] Add success.
I0320 07:50:43.409791 543705 cpu.go:282] Add success.
I0320 07:50:43.420071 543705 net.go:648] Add success.
I0320 07:50:43.423265 543705 net.go:770] primary dev: ETH0
I0320 07:50:43.423280 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:50:43.423295 543705 net.go:698] Add success.
I0320 07:50:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:50:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:50:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:50:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:50:53.409778 543705 memory.go:184] no items to output this cycle
I0320 07:50:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 07:51:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:03.409793 543705 memory.go:184] no items to output this cycle
I0320 07:51:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 07:51:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:13.409801 543705 memory.go:191] Add success.
I0320 07:51:13.409801 543705 cpu.go:282] Add success.
W0320 07:51:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:51:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:51:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:51:13.420546 543705 net.go:648] Add success.
I0320 07:51:13.423368 543705 net.go:770] primary dev: ETH0
I0320 07:51:13.423381 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:51:13.423393 543705 net.go:698] Add success.
I0320 07:51:13.658452 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"76567a90-63c6-482b-b47e-d192089180a8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:51:13.658489 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:51:14.453972 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:51:14.455242 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:51:14.455261 543705 disk_worker.go:708] disk space is not compliant
W0320 07:51:14.455263 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:51:14.456637 543705 disk_worker.go:494] system disk:vda1
I0320 07:51:14.456691 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:51:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:51:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:51:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:51:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:51:16.472411 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:51:23.329672 543705 disk_info.go:125] begin check local disk info of client
I0320 07:51:23.332124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:51:23.332130 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025f1c0 0xc00025f200]
E0320 07:51:23.407612 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:23.407624 543705 memory.go:184] no items to output this cycle
I0320 07:51:23.407657 543705 cpu.go:275] no items to output this cycle
E0320 07:51:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:33.409809 543705 memory.go:184] no items to output this cycle
I0320 07:51:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 07:51:38.281739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:51:38.281750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:51:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:43.410744 543705 memory.go:191] Add success.
I0320 07:51:43.409826 543705 cpu.go:282] Add success.
I0320 07:51:43.420537 543705 net.go:648] Add success.
I0320 07:51:43.423548 543705 net.go:770] primary dev: ETH0
I0320 07:51:43.423561 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:51:43.423573 543705 net.go:698] Add success.
I0320 07:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:51:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:51:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:51:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:51:53.409796 543705 memory.go:184] no items to output this cycle
I0320 07:51:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 07:52:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:03.409775 543705 memory.go:184] no items to output this cycle
I0320 07:52:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:52:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:13.409824 543705 memory.go:191] Add success.
I0320 07:52:13.409832 543705 cpu.go:282] Add success.
W0320 07:52:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:52:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:52:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:52:13.420132 543705 net.go:648] Add success.
I0320 07:52:13.422932 543705 net.go:770] primary dev: ETH0
I0320 07:52:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:52:13.422961 543705 net.go:698] Add success.
W0320 07:52:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:52:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 07:52:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:52:14.456803 543705 disk_worker.go:494] system disk:vda1
I0320 07:52:14.456844 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:52:14.457104 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:52:14.457111 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:52:14.457116 543705 custom_config.go:64] query custom config with name: gpu
E0320 07:52:15.456882 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:52:15.456891 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:52:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:52:16.457956 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:52:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:52:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:52:16.472348 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:52:23.333680 543705 disk_info.go:125] begin check local disk info of client
I0320 07:52:23.336144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:52:23.336151 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c65c0 0xc0001c6600]
E0320 07:52:23.407651 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:23.407667 543705 memory.go:184] no items to output this cycle
I0320 07:52:23.407680 543705 cpu.go:275] no items to output this cycle
E0320 07:52:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:33.409775 543705 memory.go:184] no items to output this cycle
I0320 07:52:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 07:52:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:43.409812 543705 memory.go:191] Add success.
I0320 07:52:43.409820 543705 cpu.go:282] Add success.
I0320 07:52:43.419980 543705 net.go:648] Add success.
I0320 07:52:43.422853 543705 net.go:770] primary dev: ETH0
I0320 07:52:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:52:43.423042 543705 net.go:698] Add success.
I0320 07:52:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:52:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:52:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:52:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:52:53.409801 543705 memory.go:184] no items to output this cycle
I0320 07:52:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 07:53:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:03.409796 543705 memory.go:184] no items to output this cycle
I0320 07:53:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 07:53:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:13.409822 543705 memory.go:191] Add success.
I0320 07:53:13.409832 543705 cpu.go:282] Add success.
W0320 07:53:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:53:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:53:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:53:13.420147 543705 net.go:648] Add success.
I0320 07:53:13.422838 543705 net.go:770] primary dev: ETH0
I0320 07:53:13.422851 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:53:13.422863 543705 net.go:698] Add success.
I0320 07:53:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:53:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:53:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 07:53:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:53:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 07:53:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:53:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:53:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:53:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:53:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:53:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:53:23.337675 543705 disk_info.go:125] begin check local disk info of client
I0320 07:53:23.340145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:53:23.340152 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbcc0 0xc0001fbd00]
E0320 07:53:23.407624 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:23.407641 543705 memory.go:184] no items to output this cycle
I0320 07:53:23.407663 543705 cpu.go:275] no items to output this cycle
E0320 07:53:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:33.409783 543705 memory.go:184] no items to output this cycle
I0320 07:53:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 07:53:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:43.409781 543705 memory.go:191] Add success.
I0320 07:53:43.409805 543705 cpu.go:282] Add success.
I0320 07:53:43.419848 543705 net.go:648] Add success.
I0320 07:53:43.423062 543705 net.go:770] primary dev: ETH0
I0320 07:53:43.423074 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:53:43.423087 543705 net.go:698] Add success.
I0320 07:53:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:53:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:53:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:53:53.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:53:53.409890 543705 cpu.go:275] no items to output this cycle
I0320 07:53:53.409893 543705 memory.go:184] no items to output this cycle
E0320 07:54:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:03.409784 543705 memory.go:184] no items to output this cycle
I0320 07:54:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 07:54:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:13.409781 543705 memory.go:191] Add success.
W0320 07:54:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 07:54:13.409815 543705 cpu.go:282] Add success.
W0320 07:54:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:54:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:54:13.420123 543705 net.go:648] Add success.
I0320 07:54:13.423122 543705 net.go:770] primary dev: ETH0
I0320 07:54:13.423136 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:54:13.423150 543705 net.go:698] Add success.
I0320 07:54:13.469791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f003605c-5bf3-49d5-9ed8-81a78c78c95c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:54:13.469823 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 07:54:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:54:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:54:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 07:54:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:54:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 07:54:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:54:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:54:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:54:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:54:16.472399 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:54:23.341673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:54:23.344116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:54:23.344123 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa180 0xc0001fa1c0]
E0320 07:54:23.407521 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:23.407533 543705 memory.go:184] no items to output this cycle
I0320 07:54:23.407533 543705 cpu.go:275] no items to output this cycle
E0320 07:54:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 07:54:33.409792 543705 memory.go:184] no items to output this cycle
I0320 07:54:38.285732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:54:38.285738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:54:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:43.410674 543705 memory.go:191] Add success.
I0320 07:54:43.409820 543705 cpu.go:282] Add success.
I0320 07:54:43.420442 543705 net.go:648] Add success.
I0320 07:54:43.423582 543705 net.go:770] primary dev: ETH0
I0320 07:54:43.423595 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:54:43.423608 543705 net.go:698] Add success.
I0320 07:54:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:54:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:54:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:54:53.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:54:53.409883 543705 memory.go:184] no items to output this cycle
I0320 07:54:53.409955 543705 cpu.go:275] no items to output this cycle
E0320 07:55:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:03.409763 543705 memory.go:184] no items to output this cycle
I0320 07:55:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 07:55:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:13.409790 543705 memory.go:191] Add success.
W0320 07:55:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 07:55:13.409820 543705 cpu.go:282] Add success.
W0320 07:55:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:55:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:55:13.420121 543705 net.go:648] Add success.
I0320 07:55:13.423061 543705 net.go:770] primary dev: ETH0
I0320 07:55:13.423073 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:55:13.423085 543705 net.go:698] Add success.
I0320 07:55:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:55:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:55:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 07:55:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:55:14.456545 543705 disk_worker.go:494] system disk:vda1
I0320 07:55:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:55:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:55:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:55:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:55:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:55:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:55:23.345677 543705 disk_info.go:125] begin check local disk info of client
I0320 07:55:23.348171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:55:23.348177 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048c4c0 0xc00048c500]
E0320 07:55:23.407625 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:23.407640 543705 memory.go:184] no items to output this cycle
I0320 07:55:23.407653 543705 cpu.go:275] no items to output this cycle
E0320 07:55:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:33.409791 543705 memory.go:184] no items to output this cycle
I0320 07:55:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 07:55:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:43.409794 543705 memory.go:191] Add success.
I0320 07:55:43.409796 543705 cpu.go:282] Add success.
I0320 07:55:43.419881 543705 net.go:648] Add success.
I0320 07:55:43.422876 543705 net.go:770] primary dev: ETH0
I0320 07:55:43.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:55:43.422905 543705 net.go:698] Add success.
I0320 07:55:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:55:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:55:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:55:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:55:53.409843 543705 memory.go:184] no items to output this cycle
I0320 07:55:53.409918 543705 cpu.go:275] no items to output this cycle
E0320 07:56:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:03.409779 543705 memory.go:184] no items to output this cycle
I0320 07:56:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 07:56:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:13.409796 543705 memory.go:191] Add success.
I0320 07:56:13.409798 543705 cpu.go:282] Add success.
W0320 07:56:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:56:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:56:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:56:13.420044 543705 net.go:648] Add success.
I0320 07:56:13.422512 543705 net.go:770] primary dev: ETH0
I0320 07:56:13.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:56:13.422550 543705 net.go:698] Add success.
I0320 07:56:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:56:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:56:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 07:56:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:56:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 07:56:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:56:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:56:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:56:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:56:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:56:16.472379 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:56:23.349685 543705 disk_info.go:125] begin check local disk info of client
I0320 07:56:23.352152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:56:23.352158 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048ce80 0xc00048cec0]
E0320 07:56:23.407511 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:23.407525 543705 memory.go:184] no items to output this cycle
I0320 07:56:23.407554 543705 cpu.go:275] no items to output this cycle
E0320 07:56:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:33.409778 543705 memory.go:184] no items to output this cycle
I0320 07:56:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 07:56:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:43.409789 543705 memory.go:191] Add success.
I0320 07:56:43.409790 543705 cpu.go:282] Add success.
I0320 07:56:43.419882 543705 net.go:648] Add success.
I0320 07:56:43.422699 543705 net.go:770] primary dev: ETH0
I0320 07:56:43.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:56:43.422727 543705 net.go:698] Add success.
I0320 07:56:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:56:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:56:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:56:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:56:53.409793 543705 memory.go:184] no items to output this cycle
I0320 07:56:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 07:57:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:03.409788 543705 cpu.go:275] no items to output this cycle
I0320 07:57:03.409790 543705 memory.go:184] no items to output this cycle
E0320 07:57:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:13.409822 543705 memory.go:191] Add success.
I0320 07:57:13.409834 543705 cpu.go:282] Add success.
W0320 07:57:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:57:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:57:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:57:13.420168 543705 net.go:648] Add success.
I0320 07:57:13.422972 543705 net.go:770] primary dev: ETH0
I0320 07:57:13.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:57:13.423000 543705 net.go:698] Add success.
I0320 07:57:13.429689 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 07:57:13.452927 543705 event_worker.go:152] Polling the log file for events...
I0320 07:57:13.470395 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"89b60826-e737-4da2-94e5-6f66f625db4b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 07:57:13.470428 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 07:57:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:57:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 07:57:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 07:57:14.455881 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 07:57:14.455889 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 07:57:14.455894 543705 custom_config.go:64] query custom config with name: gpu
I0320 07:57:14.456534 543705 disk_worker.go:494] system disk:vda1
I0320 07:57:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 07:57:15.456802 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 07:57:15.456811 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:57:16.457959 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 07:57:16.457969 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 07:57:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:57:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:57:16.472361 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:57:23.353678 543705 disk_info.go:125] begin check local disk info of client
I0320 07:57:23.356180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:57:23.356187 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b140 0xc00007b180]
E0320 07:57:23.407589 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:23.407613 543705 memory.go:184] no items to output this cycle
I0320 07:57:23.407627 543705 cpu.go:275] no items to output this cycle
E0320 07:57:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 07:57:33.409792 543705 memory.go:184] no items to output this cycle
I0320 07:57:38.289735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 07:57:38.289742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 07:57:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:43.410672 543705 memory.go:191] Add success.
I0320 07:57:43.409799 543705 cpu.go:282] Add success.
I0320 07:57:43.420348 543705 net.go:648] Add success.
I0320 07:57:43.423257 543705 net.go:770] primary dev: ETH0
I0320 07:57:43.423270 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:57:43.423283 543705 net.go:698] Add success.
I0320 07:57:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:57:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:57:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:57:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:57:53.409800 543705 memory.go:184] no items to output this cycle
I0320 07:57:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 07:58:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:03.409781 543705 memory.go:184] no items to output this cycle
I0320 07:58:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:58:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:13.409809 543705 cpu.go:282] Add success.
I0320 07:58:13.409815 543705 memory.go:191] Add success.
W0320 07:58:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:58:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:58:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:58:13.420083 543705 net.go:648] Add success.
I0320 07:58:13.423174 543705 net.go:770] primary dev: ETH0
I0320 07:58:13.423193 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:58:13.423208 543705 net.go:698] Add success.
I0320 07:58:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:58:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:58:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 07:58:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:58:14.456597 543705 disk_worker.go:494] system disk:vda1
I0320 07:58:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:58:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:58:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:58:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:58:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:58:16.472382 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:58:23.357680 543705 disk_info.go:125] begin check local disk info of client
I0320 07:58:23.360159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:58:23.360165 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048c240 0xc00048c280]
E0320 07:58:23.407557 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:23.407573 543705 memory.go:184] no items to output this cycle
I0320 07:58:23.407588 543705 cpu.go:275] no items to output this cycle
E0320 07:58:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:33.409798 543705 memory.go:184] no items to output this cycle
I0320 07:58:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 07:58:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:43.409807 543705 memory.go:191] Add success.
I0320 07:58:43.409811 543705 cpu.go:282] Add success.
I0320 07:58:43.419987 543705 net.go:648] Add success.
I0320 07:58:43.423211 543705 net.go:770] primary dev: ETH0
I0320 07:58:43.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:58:43.423238 543705 net.go:698] Add success.
I0320 07:58:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:58:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:58:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:58:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:58:53.409787 543705 memory.go:184] no items to output this cycle
I0320 07:58:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 07:59:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:03.409814 543705 memory.go:184] no items to output this cycle
I0320 07:59:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 07:59:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:13.409791 543705 memory.go:191] Add success.
I0320 07:59:13.409809 543705 cpu.go:282] Add success.
W0320 07:59:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 07:59:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 07:59:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 07:59:13.420260 543705 net.go:648] Add success.
I0320 07:59:13.423217 543705 net.go:770] primary dev: ETH0
I0320 07:59:13.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:59:13.423242 543705 net.go:698] Add success.
I0320 07:59:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 07:59:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 07:59:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 07:59:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 07:59:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 07:59:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 07:59:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 07:59:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:59:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:59:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 07:59:16.472433 543705 disk_local_worker.go:436] Get disk info: []
I0320 07:59:23.361673 543705 disk_info.go:125] begin check local disk info of client
I0320 07:59:23.364178 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 07:59:23.364185 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7ac0 0xc0001c7b00]
E0320 07:59:23.407552 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:23.407568 543705 memory.go:184] no items to output this cycle
I0320 07:59:23.407587 543705 cpu.go:275] no items to output this cycle
E0320 07:59:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:33.409787 543705 memory.go:184] no items to output this cycle
I0320 07:59:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 07:59:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:43.409811 543705 memory.go:191] Add success.
I0320 07:59:43.409818 543705 cpu.go:282] Add success.
I0320 07:59:43.419904 543705 net.go:648] Add success.
I0320 07:59:43.422981 543705 net.go:770] primary dev: ETH0
I0320 07:59:43.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 07:59:43.423009 543705 net.go:698] Add success.
I0320 07:59:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 07:59:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 07:59:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 07:59:53.410347 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 07:59:53.410361 543705 memory.go:184] no items to output this cycle
I0320 07:59:53.410394 543705 cpu.go:275] no items to output this cycle
E0320 08:00:03.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:03.409888 543705 memory.go:184] no items to output this cycle
I0320 08:00:03.409912 543705 cpu.go:275] no items to output this cycle
E0320 08:00:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:13.409789 543705 memory.go:191] Add success.
W0320 08:00:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:00:13.409821 543705 cpu.go:282] Add success.
W0320 08:00:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:00:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:00:13.420132 543705 net.go:648] Add success.
I0320 08:00:13.422796 543705 net.go:770] primary dev: ETH0
I0320 08:00:13.422808 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:00:13.422820 543705 net.go:698] Add success.
I0320 08:00:13.469151 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4a391497-42dc-4df4-b7a6-06e6c9266cf6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:00:13.469183 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:00:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:00:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:00:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 08:00:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:00:14.456527 543705 disk_worker.go:494] system disk:vda1
I0320 08:00:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:00:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:00:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:00:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:00:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:00:16.472419 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:00:23.365674 543705 disk_info.go:125] begin check local disk info of client
I0320 08:00:23.368174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:00:23.368180 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb800 0xc0001fb840]
E0320 08:00:23.407533 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:23.407551 543705 memory.go:184] no items to output this cycle
I0320 08:00:23.407566 543705 cpu.go:275] no items to output this cycle
E0320 08:00:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:33.409788 543705 memory.go:184] no items to output this cycle
I0320 08:00:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 08:00:38.293730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:00:38.293736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:00:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:43.410695 543705 memory.go:191] Add success.
I0320 08:00:43.409816 543705 cpu.go:282] Add success.
I0320 08:00:43.420468 543705 net.go:648] Add success.
I0320 08:00:43.423515 543705 net.go:770] primary dev: ETH0
I0320 08:00:43.423530 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:00:43.423543 543705 net.go:698] Add success.
I0320 08:00:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:00:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:00:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:00:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:00:53.409760 543705 memory.go:184] no items to output this cycle
I0320 08:00:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:01:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:03.409782 543705 memory.go:184] no items to output this cycle
I0320 08:01:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 08:01:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:13.409792 543705 cpu.go:282] Add success.
I0320 08:01:13.409797 543705 memory.go:191] Add success.
W0320 08:01:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:01:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:01:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:01:13.420103 543705 net.go:648] Add success.
I0320 08:01:13.422906 543705 net.go:770] primary dev: ETH0
I0320 08:01:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:01:13.422935 543705 net.go:698] Add success.
I0320 08:01:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:01:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:01:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 08:01:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:01:14.456613 543705 disk_worker.go:494] system disk:vda1
I0320 08:01:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:01:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:01:16.458009 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:01:16.458072 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:01:16.458093 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:01:16.472458 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:01:23.369674 543705 disk_info.go:125] begin check local disk info of client
I0320 08:01:23.372113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:01:23.372120 543705 disk_info.go:196] parse disk info done, disk is : [0xc000326140 0xc000326180]
E0320 08:01:23.407462 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:23.407476 543705 memory.go:184] no items to output this cycle
I0320 08:01:23.407483 543705 cpu.go:275] no items to output this cycle
E0320 08:01:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:33.409811 543705 memory.go:184] no items to output this cycle
I0320 08:01:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 08:01:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:43.409777 543705 memory.go:191] Add success.
I0320 08:01:43.409803 543705 cpu.go:282] Add success.
I0320 08:01:43.419879 543705 net.go:648] Add success.
I0320 08:01:43.423131 543705 net.go:770] primary dev: ETH0
I0320 08:01:43.423144 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:01:43.423157 543705 net.go:698] Add success.
I0320 08:01:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:01:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:01:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:01:53.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:01:53.409885 543705 memory.go:184] no items to output this cycle
I0320 08:01:53.409972 543705 cpu.go:275] no items to output this cycle
E0320 08:02:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:03.409773 543705 memory.go:184] no items to output this cycle
I0320 08:02:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 08:02:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:13.409799 543705 memory.go:191] Add success.
I0320 08:02:13.409815 543705 cpu.go:282] Add success.
W0320 08:02:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:02:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:02:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:02:13.420507 543705 net.go:648] Add success.
I0320 08:02:13.423312 543705 net.go:770] primary dev: ETH0
I0320 08:02:13.423331 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:02:13.423352 543705 net.go:698] Add success.
W0320 08:02:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:02:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 08:02:14.455158 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:02:14.456931 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:02:14.456940 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:02:14.456947 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:02:14.457030 543705 disk_worker.go:494] system disk:vda1
I0320 08:02:14.457061 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:02:15.456776 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:02:15.456786 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:02:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:02:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:02:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:02:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:02:16.472347 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:02:23.373674 543705 disk_info.go:125] begin check local disk info of client
I0320 08:02:23.376116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:02:23.376123 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab900 0xc0001ab940]
E0320 08:02:23.408483 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:23.408497 543705 memory.go:184] no items to output this cycle
I0320 08:02:23.408516 543705 cpu.go:275] no items to output this cycle
E0320 08:02:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:33.409784 543705 memory.go:184] no items to output this cycle
I0320 08:02:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 08:02:43.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:43.409772 543705 memory.go:191] Add success.
I0320 08:02:43.409807 543705 cpu.go:282] Add success.
I0320 08:02:43.419945 543705 net.go:648] Add success.
I0320 08:02:43.422996 543705 net.go:770] primary dev: ETH0
I0320 08:02:43.423012 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:02:43.423054 543705 net.go:698] Add success.
I0320 08:02:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:02:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:02:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:02:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:02:53.409802 543705 memory.go:184] no items to output this cycle
I0320 08:02:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:03:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:03.409765 543705 memory.go:184] no items to output this cycle
I0320 08:03:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 08:03:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:13.409792 543705 memory.go:191] Add success.
I0320 08:03:13.409812 543705 cpu.go:282] Add success.
W0320 08:03:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:03:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:03:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:03:13.420208 543705 net.go:648] Add success.
I0320 08:03:13.422817 543705 net.go:770] primary dev: ETH0
I0320 08:03:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:03:13.422851 543705 net.go:698] Add success.
I0320 08:03:13.468888 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"975e0f8a-ae2a-4b43-ab91-698c30b2c0b6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:03:13.468923 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:03:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:03:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:03:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 08:03:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:03:14.456557 543705 disk_worker.go:494] system disk:vda1
I0320 08:03:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:03:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:03:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:03:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:03:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:03:16.472396 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:03:23.377675 543705 disk_info.go:125] begin check local disk info of client
I0320 08:03:23.380180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:03:23.380187 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa3c0 0xc0001aa400]
E0320 08:03:23.407443 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:23.407451 543705 cpu.go:275] no items to output this cycle
I0320 08:03:23.407454 543705 memory.go:184] no items to output this cycle
E0320 08:03:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:33.409780 543705 memory.go:184] no items to output this cycle
I0320 08:03:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 08:03:38.297743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:03:38.297749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:03:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:43.411072 543705 memory.go:191] Add success.
I0320 08:03:43.409815 543705 cpu.go:282] Add success.
I0320 08:03:43.419977 543705 net.go:648] Add success.
I0320 08:03:43.423374 543705 net.go:770] primary dev: ETH0
I0320 08:03:43.423388 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:03:43.423399 543705 net.go:698] Add success.
I0320 08:03:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:03:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:03:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:03:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:03:53.409774 543705 memory.go:184] no items to output this cycle
I0320 08:03:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 08:04:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:03.409806 543705 memory.go:184] no items to output this cycle
I0320 08:04:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 08:04:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:13.409786 543705 memory.go:191] Add success.
W0320 08:04:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:04:13.409814 543705 cpu.go:282] Add success.
W0320 08:04:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:04:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:04:13.420484 543705 net.go:648] Add success.
I0320 08:04:13.424162 543705 net.go:770] primary dev: ETH0
I0320 08:04:13.424178 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:04:13.424192 543705 net.go:698] Add success.
I0320 08:04:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:04:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:04:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 08:04:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:04:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 08:04:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:04:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:04:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:04:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:04:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:04:16.472402 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:04:23.381683 543705 disk_info.go:125] begin check local disk info of client
I0320 08:04:23.384176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:04:23.384183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab240 0xc0001ab280]
E0320 08:04:23.408470 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:23.408482 543705 memory.go:184] no items to output this cycle
I0320 08:04:23.408488 543705 cpu.go:275] no items to output this cycle
E0320 08:04:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:33.409808 543705 memory.go:184] no items to output this cycle
I0320 08:04:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 08:04:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:43.409792 543705 cpu.go:282] Add success.
I0320 08:04:43.409800 543705 memory.go:191] Add success.
I0320 08:04:43.420045 543705 net.go:648] Add success.
I0320 08:04:43.423090 543705 net.go:770] primary dev: ETH0
I0320 08:04:43.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:04:43.423115 543705 net.go:698] Add success.
I0320 08:04:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:04:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:04:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:04:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:04:53.409774 543705 memory.go:184] no items to output this cycle
I0320 08:04:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 08:05:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:03.409803 543705 memory.go:184] no items to output this cycle
I0320 08:05:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 08:05:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:13.409794 543705 cpu.go:282] Add success.
I0320 08:05:13.409801 543705 memory.go:191] Add success.
W0320 08:05:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:05:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:05:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:05:13.420198 543705 net.go:648] Add success.
I0320 08:05:13.422982 543705 net.go:770] primary dev: ETH0
I0320 08:05:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:05:13.423007 543705 net.go:698] Add success.
I0320 08:05:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:05:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:05:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0320 08:05:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:05:14.456493 543705 disk_worker.go:494] system disk:vda1
I0320 08:05:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:05:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:05:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:05:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:05:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:05:16.472385 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:05:23.385683 543705 disk_info.go:125] begin check local disk info of client
I0320 08:05:23.388142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:05:23.388148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a0080 0xc0004a00c0]
E0320 08:05:23.408438 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:23.408451 543705 memory.go:184] no items to output this cycle
I0320 08:05:23.408457 543705 cpu.go:275] no items to output this cycle
E0320 08:05:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:33.409809 543705 memory.go:184] no items to output this cycle
I0320 08:05:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 08:05:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:43.409795 543705 memory.go:191] Add success.
I0320 08:05:43.409794 543705 cpu.go:282] Add success.
I0320 08:05:43.419984 543705 net.go:648] Add success.
I0320 08:05:43.422720 543705 net.go:770] primary dev: ETH0
I0320 08:05:43.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:05:43.422745 543705 net.go:698] Add success.
I0320 08:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:05:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:05:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:05:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:05:53.409787 543705 memory.go:184] no items to output this cycle
I0320 08:05:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 08:06:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:03.409796 543705 memory.go:184] no items to output this cycle
I0320 08:06:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 08:06:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:13.409822 543705 memory.go:191] Add success.
I0320 08:06:13.409833 543705 cpu.go:282] Add success.
W0320 08:06:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:06:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:06:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:06:13.420434 543705 net.go:648] Add success.
I0320 08:06:13.423364 543705 net.go:770] primary dev: ETH0
I0320 08:06:13.423378 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:06:13.423392 543705 net.go:698] Add success.
I0320 08:06:13.567492 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5d617f98-5e2f-44d1-8ec3-53e34365f2c9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:06:13.567530 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:06:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:06:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:06:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 08:06:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:06:14.456625 543705 disk_worker.go:494] system disk:vda1
I0320 08:06:14.456656 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:06:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:06:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:06:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:06:16.472442 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:06:23.389673 543705 disk_info.go:125] begin check local disk info of client
I0320 08:06:23.392131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:06:23.392137 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053cb40 0xc00053cb80]
E0320 08:06:23.408384 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:23.408396 543705 memory.go:184] no items to output this cycle
I0320 08:06:23.408429 543705 cpu.go:275] no items to output this cycle
E0320 08:06:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:33.409770 543705 memory.go:184] no items to output this cycle
I0320 08:06:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 08:06:38.301729 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:06:38.301735 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:06:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:43.410809 543705 memory.go:191] Add success.
I0320 08:06:43.409798 543705 cpu.go:282] Add success.
I0320 08:06:43.420619 543705 net.go:648] Add success.
I0320 08:06:43.423762 543705 net.go:770] primary dev: ETH0
I0320 08:06:43.423775 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:06:43.423788 543705 net.go:698] Add success.
I0320 08:06:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:06:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:06:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:06:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:06:53.409762 543705 memory.go:184] no items to output this cycle
I0320 08:06:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 08:07:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:03.409799 543705 memory.go:184] no items to output this cycle
I0320 08:07:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:07:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:13.409788 543705 memory.go:191] Add success.
I0320 08:07:13.409808 543705 cpu.go:282] Add success.
W0320 08:07:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:07:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:07:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:07:13.420204 543705 net.go:648] Add success.
I0320 08:07:13.423030 543705 net.go:770] primary dev: ETH0
I0320 08:07:13.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:07:13.423054 543705 net.go:698] Add success.
I0320 08:07:13.453623 543705 event_worker.go:152] Polling the log file for events...
W0320 08:07:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:07:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 08:07:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:07:14.457005 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:07:14.457015 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:07:14.457021 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:07:14.457046 543705 disk_worker.go:494] system disk:vda1
I0320 08:07:14.457074 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:07:15.456840 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:07:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:07:16.457950 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:07:16.457949 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:07:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:07:16.458024 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:07:16.472356 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:07:23.393671 543705 disk_info.go:125] begin check local disk info of client
I0320 08:07:23.396139 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:07:23.396146 543705 disk_info.go:196] parse disk info done, disk is : [0xc000327300 0xc000327340]
E0320 08:07:23.408375 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:23.408386 543705 memory.go:184] no items to output this cycle
I0320 08:07:23.408425 543705 cpu.go:275] no items to output this cycle
E0320 08:07:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:33.409818 543705 memory.go:184] no items to output this cycle
I0320 08:07:33.409828 543705 cpu.go:275] no items to output this cycle
E0320 08:07:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:43.409791 543705 memory.go:191] Add success.
I0320 08:07:43.409794 543705 cpu.go:282] Add success.
I0320 08:07:43.420154 543705 net.go:648] Add success.
I0320 08:07:43.423082 543705 net.go:770] primary dev: ETH0
I0320 08:07:43.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:07:43.423107 543705 net.go:698] Add success.
I0320 08:07:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:07:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:07:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:07:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:07:53.409779 543705 memory.go:184] no items to output this cycle
I0320 08:07:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 08:08:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:03.409769 543705 memory.go:184] no items to output this cycle
I0320 08:08:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:08:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:13.409829 543705 memory.go:191] Add success.
I0320 08:08:13.409837 543705 cpu.go:282] Add success.
W0320 08:08:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:08:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:08:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:08:13.420341 543705 net.go:648] Add success.
I0320 08:08:13.423177 543705 net.go:770] primary dev: ETH0
I0320 08:08:13.423190 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:08:13.423202 543705 net.go:698] Add success.
I0320 08:08:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:08:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:08:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 08:08:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:08:14.456515 543705 disk_worker.go:494] system disk:vda1
I0320 08:08:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:08:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:08:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:08:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:08:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:08:16.472418 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:08:23.397673 543705 disk_info.go:125] begin check local disk info of client
I0320 08:08:23.400081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:08:23.400087 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b400 0xc00007b440]
E0320 08:08:23.408337 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:23.408355 543705 memory.go:184] no items to output this cycle
I0320 08:08:23.408370 543705 cpu.go:275] no items to output this cycle
E0320 08:08:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:33.409773 543705 memory.go:184] no items to output this cycle
I0320 08:08:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 08:08:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:43.409814 543705 memory.go:191] Add success.
I0320 08:08:43.409819 543705 cpu.go:282] Add success.
I0320 08:08:43.419860 543705 net.go:648] Add success.
I0320 08:08:43.422512 543705 net.go:770] primary dev: ETH0
I0320 08:08:43.422528 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:08:43.422540 543705 net.go:698] Add success.
I0320 08:08:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:08:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:08:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:08:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:08:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:08:53.409785 543705 memory.go:184] no items to output this cycle
E0320 08:09:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:03.409797 543705 memory.go:184] no items to output this cycle
I0320 08:09:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 08:09:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:13.409825 543705 memory.go:191] Add success.
I0320 08:09:13.409831 543705 cpu.go:282] Add success.
W0320 08:09:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:09:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:09:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:09:13.420142 543705 net.go:648] Add success.
I0320 08:09:13.423167 543705 net.go:770] primary dev: ETH0
I0320 08:09:13.423183 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:09:13.423198 543705 net.go:698] Add success.
I0320 08:09:13.593141 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"00e97b56-d1c4-4e93-a732-7751d368fe62","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:09:13.593175 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:09:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:09:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:09:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0320 08:09:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:09:14.456758 543705 disk_worker.go:494] system disk:vda1
I0320 08:09:14.456786 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:09:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:09:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:09:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:09:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:09:16.472380 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:09:23.401672 543705 disk_info.go:125] begin check local disk info of client
I0320 08:09:23.404074 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:09:23.404080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4e40 0xc0000c4e80]
E0320 08:09:23.408338 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:23.408353 543705 memory.go:184] no items to output this cycle
I0320 08:09:23.408365 543705 cpu.go:275] no items to output this cycle
E0320 08:09:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:33.409784 543705 memory.go:184] no items to output this cycle
I0320 08:09:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 08:09:38.305735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:09:38.305743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:09:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:43.410858 543705 memory.go:191] Add success.
I0320 08:09:43.409815 543705 cpu.go:282] Add success.
I0320 08:09:43.420718 543705 net.go:648] Add success.
I0320 08:09:43.423697 543705 net.go:770] primary dev: ETH0
I0320 08:09:43.423709 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:09:43.423721 543705 net.go:698] Add success.
I0320 08:09:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:09:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:09:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:09:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:09:53.409796 543705 memory.go:184] no items to output this cycle
I0320 08:09:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:10:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:03.409767 543705 memory.go:184] no items to output this cycle
I0320 08:10:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:10:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:13.409794 543705 memory.go:191] Add success.
I0320 08:10:13.409809 543705 cpu.go:282] Add success.
W0320 08:10:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:10:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:10:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:10:13.420151 543705 net.go:648] Add success.
I0320 08:10:13.423018 543705 net.go:770] primary dev: ETH0
I0320 08:10:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:10:13.423044 543705 net.go:698] Add success.
I0320 08:10:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:10:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:10:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 08:10:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:10:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 08:10:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:10:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:10:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:10:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:10:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:10:16.472377 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:10:23.405671 543705 disk_info.go:125] begin check local disk info of client
E0320 08:10:23.407926 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:23.407945 543705 memory.go:184] no items to output this cycle
I0320 08:10:23.407958 543705 cpu.go:275] no items to output this cycle
I0320 08:10:23.408123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:10:23.408127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aba00 0xc0001aba40]
E0320 08:10:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:33.409773 543705 memory.go:184] no items to output this cycle
I0320 08:10:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 08:10:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:43.409821 543705 memory.go:191] Add success.
I0320 08:10:43.409822 543705 cpu.go:282] Add success.
I0320 08:10:43.420100 543705 net.go:648] Add success.
I0320 08:10:43.423069 543705 net.go:770] primary dev: ETH0
I0320 08:10:43.423082 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:10:43.423093 543705 net.go:698] Add success.
I0320 08:10:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:10:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:10:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:10:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:10:53.409774 543705 memory.go:184] no items to output this cycle
I0320 08:10:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 08:11:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:03.409762 543705 memory.go:184] no items to output this cycle
I0320 08:11:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 08:11:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:13.409794 543705 memory.go:191] Add success.
I0320 08:11:13.409815 543705 cpu.go:282] Add success.
W0320 08:11:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:11:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:11:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:11:13.420326 543705 net.go:648] Add success.
I0320 08:11:13.422963 543705 net.go:770] primary dev: ETH0
I0320 08:11:13.422978 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:11:13.422993 543705 net.go:698] Add success.
I0320 08:11:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:11:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:11:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 08:11:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:11:14.456539 543705 disk_worker.go:494] system disk:vda1
I0320 08:11:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:11:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:11:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:11:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:11:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:11:16.472361 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:11:23.409696 543705 disk_info.go:125] begin check local disk info of client
I0320 08:11:23.409853 543705 cpu.go:275] no items to output this cycle
E0320 08:11:23.409948 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:23.409962 543705 memory.go:184] no items to output this cycle
I0320 08:11:23.411981 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:11:23.411987 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab600 0xc0001ab640]
E0320 08:11:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:33.409808 543705 memory.go:184] no items to output this cycle
I0320 08:11:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 08:11:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:43.409812 543705 memory.go:191] Add success.
I0320 08:11:43.409818 543705 cpu.go:282] Add success.
I0320 08:11:43.419908 543705 net.go:648] Add success.
I0320 08:11:43.422785 543705 net.go:770] primary dev: ETH0
I0320 08:11:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:11:43.422813 543705 net.go:698] Add success.
I0320 08:11:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:11:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:11:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:11:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:11:53.409768 543705 memory.go:184] no items to output this cycle
I0320 08:11:53.409895 543705 cpu.go:275] no items to output this cycle
E0320 08:12:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:03.409801 543705 memory.go:184] no items to output this cycle
I0320 08:12:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 08:12:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:13.409809 543705 memory.go:191] Add success.
I0320 08:12:13.409815 543705 cpu.go:282] Add success.
W0320 08:12:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:12:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:12:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:12:13.420299 543705 net.go:648] Add success.
I0320 08:12:13.423428 543705 net.go:770] primary dev: ETH0
I0320 08:12:13.423442 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:12:13.423456 543705 net.go:698] Add success.
I0320 08:12:13.470601 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9611976a-5c5c-4744-a30c-8e8dce64dd95","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:12:13.470635 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 08:12:14.455237 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:12:14.455251 543705 disk_worker.go:708] disk space is not compliant
W0320 08:12:14.455255 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:12:14.456105 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:12:14.456114 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:12:14.456120 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:12:14.457117 543705 disk_worker.go:494] system disk:vda1
I0320 08:12:14.457146 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:12:15.456845 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:12:15.456854 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:12:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:12:16.457972 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:12:16.458014 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:12:16.458031 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:12:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:12:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:23.409777 543705 memory.go:184] no items to output this cycle
I0320 08:12:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 08:12:23.412883 543705 disk_info.go:125] begin check local disk info of client
I0320 08:12:23.415282 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:12:23.415287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a09c0 0xc0004a0a00]
E0320 08:12:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:33.409805 543705 memory.go:184] no items to output this cycle
I0320 08:12:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 08:12:38.309739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:12:38.309746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:12:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:43.410757 543705 memory.go:191] Add success.
I0320 08:12:43.409824 543705 cpu.go:282] Add success.
I0320 08:12:43.420442 543705 net.go:648] Add success.
I0320 08:12:43.423294 543705 net.go:770] primary dev: ETH0
I0320 08:12:43.423307 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:12:43.423321 543705 net.go:698] Add success.
I0320 08:12:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:12:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:12:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:12:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:12:53.409788 543705 memory.go:184] no items to output this cycle
I0320 08:12:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 08:13:03.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:03.409828 543705 memory.go:184] no items to output this cycle
I0320 08:13:03.409836 543705 cpu.go:275] no items to output this cycle
E0320 08:13:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:13.409806 543705 memory.go:191] Add success.
W0320 08:13:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:13:13.409840 543705 cpu.go:282] Add success.
W0320 08:13:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:13:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:13:13.420601 543705 net.go:648] Add success.
I0320 08:13:13.423474 543705 net.go:770] primary dev: ETH0
I0320 08:13:13.423489 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:13:13.423503 543705 net.go:698] Add success.
I0320 08:13:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:13:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:13:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 08:13:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:13:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 08:13:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:13:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:13:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:13:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:13:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:13:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:13:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:23.409808 543705 memory.go:184] no items to output this cycle
I0320 08:13:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 08:13:23.416029 543705 disk_info.go:125] begin check local disk info of client
I0320 08:13:23.418375 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:13:23.418381 543705 disk_info.go:196] parse disk info done, disk is : [0xc000326880 0xc0003268c0]
E0320 08:13:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:33.409823 543705 memory.go:184] no items to output this cycle
I0320 08:13:33.409840 543705 cpu.go:275] no items to output this cycle
E0320 08:13:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:43.409800 543705 memory.go:191] Add success.
I0320 08:13:43.409844 543705 cpu.go:282] Add success.
I0320 08:13:43.420074 543705 net.go:648] Add success.
I0320 08:13:43.422728 543705 net.go:770] primary dev: ETH0
I0320 08:13:43.422741 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:13:43.422764 543705 net.go:698] Add success.
I0320 08:13:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:13:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:13:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:13:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:13:53.409782 543705 memory.go:184] no items to output this cycle
I0320 08:13:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 08:14:03.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:03.409908 543705 memory.go:184] no items to output this cycle
I0320 08:14:03.409983 543705 cpu.go:275] no items to output this cycle
E0320 08:14:13.409809 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:13.409847 543705 memory.go:191] Add success.
I0320 08:14:13.409850 543705 cpu.go:282] Add success.
W0320 08:14:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:14:13.409890 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:14:13.409893 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:14:13.420217 543705 net.go:648] Add success.
I0320 08:14:13.423299 543705 net.go:770] primary dev: ETH0
I0320 08:14:13.423315 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:14:13.423328 543705 net.go:698] Add success.
I0320 08:14:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:14:14.455090 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:14:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 08:14:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:14:14.456492 543705 disk_worker.go:494] system disk:vda1
I0320 08:14:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:14:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:14:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:14:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:14:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:14:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:14:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:23.409777 543705 memory.go:184] no items to output this cycle
I0320 08:14:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:14:23.419016 543705 disk_info.go:125] begin check local disk info of client
I0320 08:14:23.421422 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:14:23.423499 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b7900 0xc0002b7940]
E0320 08:14:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:33.409801 543705 memory.go:184] no items to output this cycle
I0320 08:14:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 08:14:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:43.409780 543705 memory.go:191] Add success.
I0320 08:14:43.409808 543705 cpu.go:282] Add success.
I0320 08:14:43.420137 543705 net.go:648] Add success.
I0320 08:14:43.423126 543705 net.go:770] primary dev: ETH0
I0320 08:14:43.423140 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:14:43.423154 543705 net.go:698] Add success.
I0320 08:14:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:14:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:14:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:14:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:14:53.409791 543705 memory.go:184] no items to output this cycle
I0320 08:14:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 08:15:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:03.409779 543705 memory.go:184] no items to output this cycle
I0320 08:15:03.409886 543705 cpu.go:275] no items to output this cycle
E0320 08:15:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:13.409805 543705 memory.go:191] Add success.
I0320 08:15:13.409805 543705 cpu.go:282] Add success.
W0320 08:15:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:15:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:15:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:15:13.420180 543705 net.go:648] Add success.
I0320 08:15:13.423092 543705 net.go:770] primary dev: ETH0
I0320 08:15:13.423108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:15:13.423122 543705 net.go:698] Add success.
I0320 08:15:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:15:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:15:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 08:15:14.455155 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:15:14.456502 543705 disk_worker.go:494] system disk:vda1
I0320 08:15:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:15:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:15:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:15:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:15:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:15:16.472391 543705 disk_local_worker.go:436] Get disk info: []
I0320 08:15:16.711707 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7a80b1b9-02a2-46c6-9152-e946897236e8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:15:16.711742 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
E0320 08:15:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:23.409761 543705 memory.go:184] no items to output this cycle
I0320 08:15:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 08:15:23.424343 543705 disk_info.go:125] begin check local disk info of client
I0320 08:15:23.426772 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:15:23.426777 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa4c0 0xc0001fa500]
E0320 08:15:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:33.409789 543705 memory.go:184] no items to output this cycle
I0320 08:15:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 08:15:38.313730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:15:38.313736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:15:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:43.410769 543705 memory.go:191] Add success.
I0320 08:15:43.409814 543705 cpu.go:282] Add success.
I0320 08:15:43.420462 543705 net.go:648] Add success.
I0320 08:15:43.423377 543705 net.go:770] primary dev: ETH0
I0320 08:15:43.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:15:43.423409 543705 net.go:698] Add success.
I0320 08:15:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:15:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:15:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:15:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:15:53.409759 543705 memory.go:184] no items to output this cycle
I0320 08:15:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 08:16:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:03.409803 543705 memory.go:184] no items to output this cycle
I0320 08:16:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 08:16:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:13.409799 543705 memory.go:191] Add success.
I0320 08:16:13.409820 543705 cpu.go:282] Add success.
W0320 08:16:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:16:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:16:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:16:13.419753 543705 net.go:648] Add success.
I0320 08:16:13.422523 543705 net.go:770] primary dev: ETH0
I0320 08:16:13.422547 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:16:13.422559 543705 net.go:698] Add success.
I0320 08:16:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:16:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:16:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 08:16:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:16:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 08:16:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:16:15.455030 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:16:16.458214 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:16:16.458276 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:16:16.458297 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:16:16.472615 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:16:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:23.409769 543705 memory.go:184] no items to output this cycle
I0320 08:16:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 08:16:23.427333 543705 disk_info.go:125] begin check local disk info of client
I0320 08:16:23.429795 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:16:23.429801 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fac80 0xc0001facc0]
E0320 08:16:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:33.409801 543705 memory.go:184] no items to output this cycle
I0320 08:16:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 08:16:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:43.409791 543705 memory.go:191] Add success.
I0320 08:16:43.409801 543705 cpu.go:282] Add success.
I0320 08:16:43.420119 543705 net.go:648] Add success.
I0320 08:16:43.422844 543705 net.go:770] primary dev: ETH0
I0320 08:16:43.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:16:43.422872 543705 net.go:698] Add success.
I0320 08:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:16:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:16:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:16:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:16:53.409797 543705 memory.go:184] no items to output this cycle
I0320 08:16:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 08:17:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:03.409782 543705 memory.go:184] no items to output this cycle
I0320 08:17:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 08:17:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:13.409798 543705 memory.go:191] Add success.
I0320 08:17:13.409800 543705 cpu.go:282] Add success.
W0320 08:17:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:17:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:17:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:17:13.420205 543705 net.go:648] Add success.
I0320 08:17:13.423240 543705 net.go:770] primary dev: ETH0
I0320 08:17:13.423254 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:17:13.423265 543705 net.go:698] Add success.
I0320 08:17:13.452770 543705 event_worker.go:152] Polling the log file for events...
W0320 08:17:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:17:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 08:17:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:17:14.456964 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:17:14.456973 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:17:14.456980 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:17:14.457016 543705 disk_worker.go:494] system disk:vda1
I0320 08:17:14.457045 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:17:15.456832 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:17:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:17:16.457898 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:17:16.457899 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:17:16.457951 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:17:16.457970 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:17:16.472306 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:17:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:23.409768 543705 memory.go:184] no items to output this cycle
I0320 08:17:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 08:17:23.430318 543705 disk_info.go:125] begin check local disk info of client
I0320 08:17:23.432695 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:17:23.432700 543705 disk_info.go:196] parse disk info done, disk is : [0xc000534d80 0xc000534dc0]
E0320 08:17:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:33.409807 543705 memory.go:184] no items to output this cycle
I0320 08:17:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 08:17:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:43.409783 543705 memory.go:191] Add success.
I0320 08:17:43.409804 543705 cpu.go:282] Add success.
I0320 08:17:43.419983 543705 net.go:648] Add success.
I0320 08:17:43.422991 543705 net.go:770] primary dev: ETH0
I0320 08:17:43.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:17:43.423016 543705 net.go:698] Add success.
I0320 08:17:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:17:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:17:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:17:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:17:53.409775 543705 memory.go:184] no items to output this cycle
I0320 08:17:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 08:18:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:03.409805 543705 memory.go:184] no items to output this cycle
I0320 08:18:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 08:18:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:13.409784 543705 memory.go:191] Add success.
I0320 08:18:13.409809 543705 cpu.go:282] Add success.
W0320 08:18:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:18:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:18:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:18:13.420104 543705 net.go:648] Add success.
I0320 08:18:13.423053 543705 net.go:770] primary dev: ETH0
I0320 08:18:13.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:18:13.423096 543705 net.go:698] Add success.
I0320 08:18:13.467910 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"62f06532-8033-4c31-991e-58b19179ccd5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:18:13.467951 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:18:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:18:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:18:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 08:18:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:18:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 08:18:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:18:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:18:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:18:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:18:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:18:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:18:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:23.409791 543705 memory.go:184] no items to output this cycle
I0320 08:18:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 08:18:23.433337 543705 disk_info.go:125] begin check local disk info of client
I0320 08:18:23.435745 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:18:23.435751 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6300 0xc0001c6340]
E0320 08:18:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:33.409809 543705 memory.go:184] no items to output this cycle
I0320 08:18:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 08:18:38.317727 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:18:38.317734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:18:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:43.410794 543705 memory.go:191] Add success.
I0320 08:18:43.409800 543705 cpu.go:282] Add success.
I0320 08:18:43.420520 543705 net.go:648] Add success.
I0320 08:18:43.423740 543705 net.go:770] primary dev: ETH0
I0320 08:18:43.423755 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:18:43.423770 543705 net.go:698] Add success.
I0320 08:18:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:18:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:18:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:18:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:18:53.409798 543705 memory.go:184] no items to output this cycle
I0320 08:18:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:19:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:03.409801 543705 memory.go:184] no items to output this cycle
I0320 08:19:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 08:19:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:13.409823 543705 memory.go:191] Add success.
I0320 08:19:13.409826 543705 cpu.go:282] Add success.
W0320 08:19:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:19:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:19:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:19:13.419747 543705 net.go:648] Add success.
I0320 08:19:13.422648 543705 net.go:770] primary dev: ETH0
I0320 08:19:13.422663 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:19:13.422676 543705 net.go:698] Add success.
I0320 08:19:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:19:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:19:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 08:19:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:19:14.456483 543705 disk_worker.go:494] system disk:vda1
I0320 08:19:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:19:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:19:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:19:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:19:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:19:16.472424 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:19:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:23.409774 543705 memory.go:184] no items to output this cycle
I0320 08:19:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 08:19:23.436367 543705 disk_info.go:125] begin check local disk info of client
I0320 08:19:23.438809 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:19:23.438815 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb4c0 0xc0001fb500]
E0320 08:19:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:33.409810 543705 memory.go:184] no items to output this cycle
I0320 08:19:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 08:19:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:43.409780 543705 memory.go:191] Add success.
I0320 08:19:43.409800 543705 cpu.go:282] Add success.
I0320 08:19:43.419865 543705 net.go:648] Add success.
I0320 08:19:43.422644 543705 net.go:770] primary dev: ETH0
I0320 08:19:43.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:19:43.422670 543705 net.go:698] Add success.
I0320 08:19:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:19:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:19:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:19:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:19:53.409767 543705 memory.go:184] no items to output this cycle
I0320 08:19:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 08:20:03.409850 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:03.409870 543705 cpu.go:275] no items to output this cycle
I0320 08:20:03.409874 543705 memory.go:184] no items to output this cycle
E0320 08:20:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:13.409797 543705 memory.go:191] Add success.
I0320 08:20:13.409808 543705 cpu.go:282] Add success.
W0320 08:20:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:20:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:20:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:20:13.419750 543705 net.go:648] Add success.
I0320 08:20:13.422773 543705 net.go:770] primary dev: ETH0
I0320 08:20:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:20:13.422797 543705 net.go:698] Add success.
I0320 08:20:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:20:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:20:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 08:20:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:20:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 08:20:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:20:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:20:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:20:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:20:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:20:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:23.409777 543705 memory.go:184] no items to output this cycle
I0320 08:20:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 08:20:23.439350 543705 disk_info.go:125] begin check local disk info of client
I0320 08:20:23.441845 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:20:23.441850 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 08:20:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:33.409806 543705 memory.go:184] no items to output this cycle
I0320 08:20:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 08:20:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:43.409796 543705 memory.go:191] Add success.
I0320 08:20:43.409797 543705 cpu.go:282] Add success.
I0320 08:20:43.419854 543705 net.go:648] Add success.
I0320 08:20:43.422731 543705 net.go:770] primary dev: ETH0
I0320 08:20:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:20:43.422756 543705 net.go:698] Add success.
I0320 08:20:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:20:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:20:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:20:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:20:53.409781 543705 memory.go:184] no items to output this cycle
I0320 08:20:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 08:21:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:03.409793 543705 memory.go:184] no items to output this cycle
I0320 08:21:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 08:21:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:13.409811 543705 memory.go:191] Add success.
I0320 08:21:13.409812 543705 cpu.go:282] Add success.
W0320 08:21:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:21:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:21:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:21:13.419737 543705 net.go:648] Add success.
I0320 08:21:13.422108 543705 net.go:770] primary dev: ETH0
I0320 08:21:13.422124 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:21:13.422137 543705 net.go:698] Add success.
I0320 08:21:13.786768 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98e94087-895f-4fe9-968a-94ff8e0edb74","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:21:13.786805 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:21:14.454090 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:21:14.454243 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:21:14.454334 543705 disk_worker.go:708] disk space is not compliant
W0320 08:21:14.454337 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:21:14.455887 543705 disk_worker.go:494] system disk:vda1
I0320 08:21:14.455918 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:21:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:21:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:21:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:21:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:21:16.472350 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:21:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:23.409776 543705 memory.go:184] no items to output this cycle
I0320 08:21:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 08:21:23.442393 543705 disk_info.go:125] begin check local disk info of client
I0320 08:21:23.444838 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:21:23.444843 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb340 0xc0001fb380]
E0320 08:21:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:33.409794 543705 memory.go:184] no items to output this cycle
I0320 08:21:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 08:21:38.321741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:21:38.321747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:21:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:43.411015 543705 memory.go:191] Add success.
I0320 08:21:43.409818 543705 cpu.go:282] Add success.
I0320 08:21:43.419717 543705 net.go:648] Add success.
I0320 08:21:43.423172 543705 net.go:770] primary dev: ETH0
I0320 08:21:43.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:21:43.423198 543705 net.go:698] Add success.
I0320 08:21:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:21:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:21:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:21:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:21:53.409804 543705 memory.go:184] no items to output this cycle
I0320 08:21:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 08:22:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:03.409793 543705 memory.go:184] no items to output this cycle
I0320 08:22:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 08:22:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:13.409884 543705 memory.go:191] Add success.
W0320 08:22:13.409919 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:22:13.409934 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:22:13.409937 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:22:13.409950 543705 cpu.go:282] Add success.
I0320 08:22:13.419754 543705 net.go:648] Add success.
I0320 08:22:13.422503 543705 net.go:770] primary dev: ETH0
I0320 08:22:13.422516 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:22:13.422527 543705 net.go:698] Add success.
W0320 08:22:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:22:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 08:22:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:22:14.456981 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:22:14.456990 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:22:14.456996 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:22:14.457016 543705 disk_worker.go:494] system disk:vda1
I0320 08:22:14.457045 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:22:15.456813 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:22:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:22:16.457911 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:22:16.457911 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:22:16.457964 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:22:16.457983 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:22:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:22:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:23.409783 543705 memory.go:184] no items to output this cycle
I0320 08:22:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 08:22:23.445394 543705 disk_info.go:125] begin check local disk info of client
I0320 08:22:23.447898 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:22:23.447903 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb040 0xc0001fb080]
E0320 08:22:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:33.409815 543705 memory.go:184] no items to output this cycle
I0320 08:22:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 08:22:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:43.409800 543705 memory.go:191] Add success.
I0320 08:22:43.409801 543705 cpu.go:282] Add success.
I0320 08:22:43.419869 543705 net.go:648] Add success.
I0320 08:22:43.422559 543705 net.go:770] primary dev: ETH0
I0320 08:22:43.422574 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:22:43.422588 543705 net.go:698] Add success.
I0320 08:22:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:22:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:22:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:22:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:22:53.409776 543705 memory.go:184] no items to output this cycle
I0320 08:22:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 08:23:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:03.409811 543705 memory.go:184] no items to output this cycle
I0320 08:23:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 08:23:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:13.409782 543705 memory.go:191] Add success.
W0320 08:23:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:23:13.409823 543705 cpu.go:282] Add success.
W0320 08:23:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:23:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:23:13.420299 543705 net.go:648] Add success.
I0320 08:23:13.423335 543705 net.go:770] primary dev: ETH0
I0320 08:23:13.423347 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:23:13.423358 543705 net.go:698] Add success.
I0320 08:23:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:23:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:23:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 08:23:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:23:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 08:23:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:23:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:23:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:23:16.458021 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:23:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:23:16.472361 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:23:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:23.409795 543705 memory.go:184] no items to output this cycle
I0320 08:23:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 08:23:23.448411 543705 disk_info.go:125] begin check local disk info of client
I0320 08:23:23.450806 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:23:23.450811 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b3700 0xc0002b3740]
E0320 08:23:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:33.409792 543705 memory.go:184] no items to output this cycle
I0320 08:23:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 08:23:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:43.409819 543705 memory.go:191] Add success.
I0320 08:23:43.409822 543705 cpu.go:282] Add success.
I0320 08:23:43.419968 543705 net.go:648] Add success.
I0320 08:23:43.422753 543705 net.go:770] primary dev: ETH0
I0320 08:23:43.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:23:43.422783 543705 net.go:698] Add success.
I0320 08:23:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:23:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:23:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:23:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:23:53.409783 543705 memory.go:184] no items to output this cycle
I0320 08:23:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:24:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:03.409768 543705 memory.go:184] no items to output this cycle
I0320 08:24:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 08:24:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:13.409805 543705 memory.go:191] Add success.
I0320 08:24:13.409808 543705 cpu.go:282] Add success.
W0320 08:24:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:24:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:24:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:24:13.420111 543705 net.go:648] Add success.
I0320 08:24:13.423224 543705 net.go:770] primary dev: ETH0
I0320 08:24:13.423239 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:24:13.423250 543705 net.go:698] Add success.
I0320 08:24:13.468525 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bce40aad-b02d-4403-9a7f-cdaced317a59","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:24:13.468574 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:24:14.454063 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:24:14.454254 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:24:14.454265 543705 disk_worker.go:708] disk space is not compliant
W0320 08:24:14.454268 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:24:14.455623 543705 disk_worker.go:494] system disk:vda1
I0320 08:24:14.455666 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:24:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:24:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:24:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:24:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:24:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:24:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:23.409797 543705 memory.go:184] no items to output this cycle
I0320 08:24:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 08:24:23.451430 543705 disk_info.go:125] begin check local disk info of client
I0320 08:24:23.453936 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:24:23.453942 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb880 0xc0001fb8c0]
E0320 08:24:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:33.409802 543705 memory.go:184] no items to output this cycle
I0320 08:24:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 08:24:38.325737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:24:38.325744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:24:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:43.410797 543705 memory.go:191] Add success.
I0320 08:24:43.409807 543705 cpu.go:282] Add success.
I0320 08:24:43.420559 543705 net.go:648] Add success.
I0320 08:24:43.423570 543705 net.go:770] primary dev: ETH0
I0320 08:24:43.423583 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:24:43.423596 543705 net.go:698] Add success.
I0320 08:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:24:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:24:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:24:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:24:53.409772 543705 memory.go:184] no items to output this cycle
I0320 08:24:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 08:25:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:03.409766 543705 memory.go:184] no items to output this cycle
I0320 08:25:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 08:25:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:13.409818 543705 memory.go:191] Add success.
I0320 08:25:13.409824 543705 cpu.go:282] Add success.
W0320 08:25:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:25:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:25:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:25:13.420324 543705 net.go:648] Add success.
I0320 08:25:13.423202 543705 net.go:770] primary dev: ETH0
I0320 08:25:13.423216 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:25:13.423232 543705 net.go:698] Add success.
I0320 08:25:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:25:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:25:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 08:25:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:25:14.456571 543705 disk_worker.go:494] system disk:vda1
I0320 08:25:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:25:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:25:16.458005 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:25:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:25:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:25:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:25:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:23.409764 543705 memory.go:184] no items to output this cycle
I0320 08:25:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 08:25:23.454469 543705 disk_info.go:125] begin check local disk info of client
I0320 08:25:23.456844 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:25:23.456849 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004fc180 0xc0004fc1c0]
E0320 08:25:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 08:25:33.409793 543705 memory.go:184] no items to output this cycle
E0320 08:25:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:43.409780 543705 memory.go:191] Add success.
I0320 08:25:43.409796 543705 cpu.go:282] Add success.
I0320 08:25:43.419897 543705 net.go:648] Add success.
I0320 08:25:43.420849 543705 net.go:770] primary dev: ETH0
I0320 08:25:43.420864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:25:43.420879 543705 net.go:698] Add success.
I0320 08:25:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:25:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:25:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:25:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:25:53.409789 543705 memory.go:184] no items to output this cycle
I0320 08:25:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 08:26:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:03.409778 543705 memory.go:184] no items to output this cycle
I0320 08:26:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:26:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:13.409799 543705 memory.go:191] Add success.
I0320 08:26:13.409800 543705 cpu.go:282] Add success.
W0320 08:26:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:26:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:26:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:26:13.420194 543705 net.go:648] Add success.
I0320 08:26:13.422887 543705 net.go:770] primary dev: ETH0
I0320 08:26:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:26:13.422916 543705 net.go:698] Add success.
I0320 08:26:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:26:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:26:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 08:26:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:26:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 08:26:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:26:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:26:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:26:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:26:16.458163 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:26:16.472090 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:26:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 08:26:23.409792 543705 memory.go:184] no items to output this cycle
I0320 08:26:23.457483 543705 disk_info.go:125] begin check local disk info of client
I0320 08:26:23.459916 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:26:23.459921 543705 disk_info.go:196] parse disk info done, disk is : [0xc000327040 0xc000327080]
E0320 08:26:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:33.409812 543705 memory.go:184] no items to output this cycle
I0320 08:26:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 08:26:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:43.409784 543705 memory.go:191] Add success.
I0320 08:26:43.409795 543705 cpu.go:282] Add success.
I0320 08:26:43.419858 543705 net.go:648] Add success.
I0320 08:26:43.422628 543705 net.go:770] primary dev: ETH0
I0320 08:26:43.422640 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:26:43.422652 543705 net.go:698] Add success.
I0320 08:26:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:26:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:26:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:26:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:26:53.409766 543705 memory.go:184] no items to output this cycle
I0320 08:26:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 08:27:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:03.409796 543705 memory.go:184] no items to output this cycle
I0320 08:27:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:27:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:13.409781 543705 memory.go:191] Add success.
W0320 08:27:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:27:13.409810 543705 cpu.go:282] Add success.
W0320 08:27:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:27:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:27:13.420064 543705 net.go:648] Add success.
I0320 08:27:13.422787 543705 net.go:770] primary dev: ETH0
I0320 08:27:13.422807 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:27:13.422822 543705 net.go:698] Add success.
I0320 08:27:13.429434 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 08:27:13.453610 543705 event_worker.go:152] Polling the log file for events...
I0320 08:27:13.463832 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9e6729f9-ec79-4f51-ad82-99ef5fd342bc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:27:13.463868 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 08:27:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:27:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 08:27:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:27:14.455806 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:27:14.455813 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:27:14.455817 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:27:14.456935 543705 disk_worker.go:494] system disk:vda1
I0320 08:27:14.456979 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:27:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:27:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:27:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:27:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:27:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:27:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:27:16.472331 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:27:23.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:23.409878 543705 memory.go:184] no items to output this cycle
I0320 08:27:23.409923 543705 cpu.go:275] no items to output this cycle
I0320 08:27:23.460911 543705 disk_info.go:125] begin check local disk info of client
I0320 08:27:23.463353 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:27:23.463358 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048b1c0 0xc00048b200]
E0320 08:27:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:33.409783 543705 memory.go:184] no items to output this cycle
I0320 08:27:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 08:27:38.329737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:27:38.329744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:27:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:43.410516 543705 memory.go:191] Add success.
I0320 08:27:43.409803 543705 cpu.go:282] Add success.
I0320 08:27:43.420216 543705 net.go:648] Add success.
I0320 08:27:43.422752 543705 net.go:770] primary dev: ETH0
I0320 08:27:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:27:43.422778 543705 net.go:698] Add success.
I0320 08:27:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:27:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:27:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:27:53.410273 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:27:53.410294 543705 memory.go:184] no items to output this cycle
I0320 08:27:53.410297 543705 cpu.go:275] no items to output this cycle
E0320 08:28:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:03.409766 543705 memory.go:184] no items to output this cycle
I0320 08:28:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 08:28:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:13.409830 543705 memory.go:191] Add success.
I0320 08:28:13.409833 543705 cpu.go:282] Add success.
W0320 08:28:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:28:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:28:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:28:13.420195 543705 net.go:648] Add success.
I0320 08:28:13.423213 543705 net.go:770] primary dev: ETH0
I0320 08:28:13.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:28:13.423238 543705 net.go:698] Add success.
I0320 08:28:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:28:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:28:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 08:28:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:28:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 08:28:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:28:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:28:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:28:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:28:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:28:16.472371 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:28:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:23.409775 543705 memory.go:184] no items to output this cycle
I0320 08:28:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:28:23.463428 543705 disk_info.go:125] begin check local disk info of client
I0320 08:28:23.465902 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:28:23.465908 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b2c80 0xc0004b2cc0]
E0320 08:28:33.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:33.409884 543705 memory.go:184] no items to output this cycle
I0320 08:28:33.410010 543705 cpu.go:275] no items to output this cycle
E0320 08:28:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:43.409801 543705 cpu.go:282] Add success.
I0320 08:28:43.409806 543705 memory.go:191] Add success.
I0320 08:28:43.419880 543705 net.go:648] Add success.
I0320 08:28:43.422777 543705 net.go:770] primary dev: ETH0
I0320 08:28:43.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:28:43.422801 543705 net.go:698] Add success.
I0320 08:28:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:28:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:28:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:28:53.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:28:53.409824 543705 memory.go:184] no items to output this cycle
I0320 08:28:53.409839 543705 cpu.go:275] no items to output this cycle
E0320 08:29:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:03.409781 543705 cpu.go:275] no items to output this cycle
I0320 08:29:03.409784 543705 memory.go:184] no items to output this cycle
E0320 08:29:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:13.409809 543705 memory.go:191] Add success.
I0320 08:29:13.409808 543705 cpu.go:282] Add success.
W0320 08:29:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:29:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:29:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:29:13.420124 543705 net.go:648] Add success.
I0320 08:29:13.422889 543705 net.go:770] primary dev: ETH0
I0320 08:29:13.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:29:13.422915 543705 net.go:698] Add success.
I0320 08:29:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:29:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:29:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 08:29:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:29:14.456511 543705 disk_worker.go:494] system disk:vda1
I0320 08:29:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:29:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:29:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:29:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:29:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:29:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:29:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:23.409779 543705 memory.go:184] no items to output this cycle
I0320 08:29:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 08:29:23.466036 543705 disk_info.go:125] begin check local disk info of client
I0320 08:29:23.468516 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:29:23.468523 543705 disk_info.go:196] parse disk info done, disk is : [0xc000536000 0xc000536040]
E0320 08:29:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:33.409815 543705 memory.go:184] no items to output this cycle
I0320 08:29:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 08:29:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:43.409786 543705 memory.go:191] Add success.
I0320 08:29:43.409820 543705 cpu.go:282] Add success.
I0320 08:29:43.419858 543705 net.go:648] Add success.
I0320 08:29:43.422667 543705 net.go:770] primary dev: ETH0
I0320 08:29:43.422681 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:29:43.422694 543705 net.go:698] Add success.
I0320 08:29:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:29:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:29:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:29:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:29:53.409769 543705 memory.go:184] no items to output this cycle
I0320 08:29:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:30:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:03.409783 543705 memory.go:184] no items to output this cycle
I0320 08:30:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:30:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:13.409823 543705 memory.go:191] Add success.
I0320 08:30:13.409828 543705 cpu.go:282] Add success.
W0320 08:30:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:30:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:30:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:30:13.420121 543705 net.go:648] Add success.
I0320 08:30:13.423433 543705 net.go:770] primary dev: ETH0
I0320 08:30:13.423446 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:30:13.423457 543705 net.go:698] Add success.
I0320 08:30:13.468048 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0f67d5fe-da0e-46ef-97e2-8921a0203b9f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:30:13.468083 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:30:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:30:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:30:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 08:30:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:30:14.456525 543705 disk_worker.go:494] system disk:vda1
I0320 08:30:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:30:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:30:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:30:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:30:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:30:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:30:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:23.409762 543705 memory.go:184] no items to output this cycle
I0320 08:30:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 08:30:23.469537 543705 disk_info.go:125] begin check local disk info of client
I0320 08:30:23.471943 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:30:23.471949 543705 disk_info.go:196] parse disk info done, disk is : [0xc000327200 0xc000327240]
E0320 08:30:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:33.409799 543705 memory.go:184] no items to output this cycle
I0320 08:30:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 08:30:38.333730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:30:38.333737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:30:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:43.410868 543705 memory.go:191] Add success.
I0320 08:30:43.409795 543705 cpu.go:282] Add success.
I0320 08:30:43.420578 543705 net.go:648] Add success.
I0320 08:30:43.423920 543705 net.go:770] primary dev: ETH0
I0320 08:30:43.423933 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:30:43.423945 543705 net.go:698] Add success.
I0320 08:30:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:30:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:30:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:30:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:30:53.409776 543705 memory.go:184] no items to output this cycle
I0320 08:30:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 08:31:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:03.409776 543705 memory.go:184] no items to output this cycle
I0320 08:31:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 08:31:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:13.409794 543705 memory.go:191] Add success.
I0320 08:31:13.409814 543705 cpu.go:282] Add success.
W0320 08:31:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:31:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:31:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:31:13.420143 543705 net.go:648] Add success.
I0320 08:31:13.423392 543705 net.go:770] primary dev: ETH0
I0320 08:31:13.423405 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:31:13.423416 543705 net.go:698] Add success.
I0320 08:31:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:31:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:31:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 08:31:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:31:14.456613 543705 disk_worker.go:494] system disk:vda1
I0320 08:31:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:31:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:31:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:31:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:31:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:31:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:23.409794 543705 memory.go:184] no items to output this cycle
I0320 08:31:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 08:31:23.472541 543705 disk_info.go:125] begin check local disk info of client
I0320 08:31:23.474956 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:31:23.474961 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b4c0 0xc00007b500]
E0320 08:31:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:33.409776 543705 memory.go:184] no items to output this cycle
I0320 08:31:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 08:31:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:43.409811 543705 memory.go:191] Add success.
I0320 08:31:43.409822 543705 cpu.go:282] Add success.
I0320 08:31:43.419871 543705 net.go:648] Add success.
I0320 08:31:43.422593 543705 net.go:770] primary dev: ETH0
I0320 08:31:43.422607 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:31:43.422623 543705 net.go:698] Add success.
I0320 08:31:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:31:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:31:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:31:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:31:53.409797 543705 memory.go:184] no items to output this cycle
I0320 08:31:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 08:32:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:03.409774 543705 memory.go:184] no items to output this cycle
I0320 08:32:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 08:32:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:13.409821 543705 memory.go:191] Add success.
I0320 08:32:13.409830 543705 cpu.go:282] Add success.
W0320 08:32:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:32:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:32:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:32:13.420215 543705 net.go:648] Add success.
I0320 08:32:13.423674 543705 net.go:770] primary dev: ETH0
I0320 08:32:13.423687 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:32:13.423704 543705 net.go:698] Add success.
W0320 08:32:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:32:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 08:32:14.455197 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:32:14.456044 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:32:14.456054 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:32:14.456060 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:32:14.456868 543705 disk_worker.go:494] system disk:vda1
I0320 08:32:14.456908 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:32:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:32:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:32:16.457956 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:32:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:32:16.458010 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:32:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:32:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:32:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:23.409776 543705 memory.go:184] no items to output this cycle
I0320 08:32:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 08:32:23.475529 543705 disk_info.go:125] begin check local disk info of client
I0320 08:32:23.477987 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:32:23.477993 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4240 0xc0000c4280]
E0320 08:32:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:33.409804 543705 memory.go:184] no items to output this cycle
I0320 08:32:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 08:32:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:43.409785 543705 memory.go:191] Add success.
I0320 08:32:43.409815 543705 cpu.go:282] Add success.
I0320 08:32:43.419873 543705 net.go:648] Add success.
I0320 08:32:43.422551 543705 net.go:770] primary dev: ETH0
I0320 08:32:43.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:32:43.422576 543705 net.go:698] Add success.
I0320 08:32:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:32:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:32:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:32:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:32:53.409782 543705 memory.go:184] no items to output this cycle
I0320 08:32:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 08:33:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:03.409793 543705 memory.go:184] no items to output this cycle
I0320 08:33:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 08:33:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:13.409803 543705 memory.go:191] Add success.
I0320 08:33:13.409803 543705 cpu.go:282] Add success.
W0320 08:33:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:33:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:33:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:33:13.420216 543705 net.go:648] Add success.
I0320 08:33:13.422990 543705 net.go:770] primary dev: ETH0
I0320 08:33:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:33:13.423019 543705 net.go:698] Add success.
I0320 08:33:13.465780 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"db7c197f-d7f6-48ce-b934-127d5160d1f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:33:13.465822 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:33:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:33:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:33:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 08:33:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:33:14.456523 543705 disk_worker.go:494] system disk:vda1
I0320 08:33:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:33:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:33:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:33:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:33:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:33:16.472437 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:33:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:23.409769 543705 memory.go:184] no items to output this cycle
I0320 08:33:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 08:33:23.478631 543705 disk_info.go:125] begin check local disk info of client
I0320 08:33:23.481046 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:33:23.481051 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4300 0xc0000c4340]
E0320 08:33:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:33:33.409794 543705 memory.go:184] no items to output this cycle
I0320 08:33:38.337730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:33:38.337736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:33:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:43.410560 543705 memory.go:191] Add success.
I0320 08:33:43.409806 543705 cpu.go:282] Add success.
I0320 08:33:43.420260 543705 net.go:648] Add success.
I0320 08:33:43.422937 543705 net.go:770] primary dev: ETH0
I0320 08:33:43.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:33:43.422963 543705 net.go:698] Add success.
I0320 08:33:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:33:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:33:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:33:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:33:53.409795 543705 memory.go:184] no items to output this cycle
I0320 08:33:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 08:34:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:03.409772 543705 memory.go:184] no items to output this cycle
I0320 08:34:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 08:34:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:13.409802 543705 memory.go:191] Add success.
I0320 08:34:13.409816 543705 cpu.go:282] Add success.
W0320 08:34:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:34:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:34:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:34:13.419714 543705 net.go:648] Add success.
I0320 08:34:13.422661 543705 net.go:770] primary dev: ETH0
I0320 08:34:13.422678 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:34:13.422697 543705 net.go:698] Add success.
I0320 08:34:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:34:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:34:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 08:34:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:34:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 08:34:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:34:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:34:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:34:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:34:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:34:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:34:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:23.409772 543705 cpu.go:275] no items to output this cycle
I0320 08:34:23.409783 543705 memory.go:184] no items to output this cycle
I0320 08:34:23.481569 543705 disk_info.go:125] begin check local disk info of client
I0320 08:34:23.484030 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:34:23.484035 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c51c0 0xc0000c5200]
E0320 08:34:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 08:34:33.409805 543705 memory.go:184] no items to output this cycle
E0320 08:34:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:43.409793 543705 memory.go:191] Add success.
I0320 08:34:43.409809 543705 cpu.go:282] Add success.
I0320 08:34:43.419973 543705 net.go:648] Add success.
I0320 08:34:43.422788 543705 net.go:770] primary dev: ETH0
I0320 08:34:43.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:34:43.422826 543705 net.go:698] Add success.
I0320 08:34:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:34:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:34:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:34:53.410271 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:34:53.410291 543705 memory.go:184] no items to output this cycle
I0320 08:34:53.410304 543705 cpu.go:275] no items to output this cycle
E0320 08:35:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:03.409766 543705 memory.go:184] no items to output this cycle
I0320 08:35:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 08:35:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:13.409820 543705 memory.go:191] Add success.
I0320 08:35:13.409827 543705 cpu.go:282] Add success.
W0320 08:35:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:35:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:35:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:35:13.420223 543705 net.go:648] Add success.
I0320 08:35:13.422816 543705 net.go:770] primary dev: ETH0
I0320 08:35:13.422829 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:35:13.422849 543705 net.go:698] Add success.
I0320 08:35:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:35:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:35:14.455143 543705 disk_worker.go:708] disk space is not compliant
W0320 08:35:14.455146 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:35:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 08:35:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:35:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:35:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:35:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:35:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:35:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:35:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:23.409765 543705 memory.go:184] no items to output this cycle
I0320 08:35:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 08:35:23.484648 543705 disk_info.go:125] begin check local disk info of client
I0320 08:35:23.487095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:35:23.487101 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab1c0 0xc0001ab200]
E0320 08:35:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:33.409776 543705 memory.go:184] no items to output this cycle
I0320 08:35:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 08:35:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:43.409790 543705 memory.go:191] Add success.
I0320 08:35:43.409791 543705 cpu.go:282] Add success.
I0320 08:35:43.419871 543705 net.go:648] Add success.
I0320 08:35:43.422542 543705 net.go:770] primary dev: ETH0
I0320 08:35:43.422557 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:35:43.422572 543705 net.go:698] Add success.
I0320 08:35:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:35:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:35:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:35:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:35:53.409774 543705 memory.go:184] no items to output this cycle
I0320 08:35:53.409777 543705 cpu.go:275] no items to output this cycle
E0320 08:36:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:03.409769 543705 memory.go:184] no items to output this cycle
I0320 08:36:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 08:36:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:13.409822 543705 memory.go:191] Add success.
I0320 08:36:13.409832 543705 cpu.go:282] Add success.
W0320 08:36:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:36:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:36:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:36:13.420542 543705 net.go:648] Add success.
I0320 08:36:13.423167 543705 net.go:770] primary dev: ETH0
I0320 08:36:13.423179 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:36:13.423190 543705 net.go:698] Add success.
I0320 08:36:13.464119 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1870935d-b0f0-4291-895d-f039509879ea","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:36:13.464154 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:36:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:36:14.455356 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:36:14.455369 543705 disk_worker.go:708] disk space is not compliant
W0320 08:36:14.455373 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:36:14.457025 543705 disk_worker.go:494] system disk:vda1
I0320 08:36:14.457054 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:36:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:36:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:36:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:36:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:36:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:36:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 08:36:23.409777 543705 memory.go:184] no items to output this cycle
I0320 08:36:23.487604 543705 disk_info.go:125] begin check local disk info of client
I0320 08:36:23.490017 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:36:23.490023 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faa00 0xc0001faa40]
E0320 08:36:33.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 08:36:33.409819 543705 memory.go:184] no items to output this cycle
I0320 08:36:38.341736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:36:38.341743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:36:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:43.410744 543705 memory.go:191] Add success.
I0320 08:36:43.409798 543705 cpu.go:282] Add success.
I0320 08:36:43.420458 543705 net.go:648] Add success.
I0320 08:36:43.423542 543705 net.go:770] primary dev: ETH0
I0320 08:36:43.423556 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:36:43.423569 543705 net.go:698] Add success.
I0320 08:36:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:36:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:36:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:36:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:36:53.409776 543705 cpu.go:275] no items to output this cycle
I0320 08:36:53.409779 543705 memory.go:184] no items to output this cycle
E0320 08:37:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:03.409774 543705 memory.go:184] no items to output this cycle
I0320 08:37:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 08:37:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:13.409801 543705 memory.go:191] Add success.
I0320 08:37:13.409803 543705 cpu.go:282] Add success.
W0320 08:37:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:37:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:37:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:37:13.420050 543705 net.go:648] Add success.
I0320 08:37:13.422674 543705 net.go:770] primary dev: ETH0
I0320 08:37:13.422687 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:37:13.422700 543705 net.go:698] Add success.
I0320 08:37:13.453317 543705 event_worker.go:152] Polling the log file for events...
W0320 08:37:14.455295 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:37:14.455309 543705 disk_worker.go:708] disk space is not compliant
W0320 08:37:14.455313 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:37:14.457391 543705 disk_worker.go:494] system disk:vda1
I0320 08:37:14.457432 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:37:14.457611 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:37:14.457618 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:37:14.457622 543705 custom_config.go:64] query custom config with name: gpu
E0320 08:37:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:37:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:37:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:37:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:37:16.457984 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:37:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:37:16.472315 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:37:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:23.409765 543705 memory.go:184] no items to output this cycle
I0320 08:37:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:37:23.490631 543705 disk_info.go:125] begin check local disk info of client
I0320 08:37:23.493019 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:37:23.493024 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492840 0xc000492880]
E0320 08:37:33.410694 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:33.410711 543705 memory.go:184] no items to output this cycle
I0320 08:37:33.410718 543705 cpu.go:275] no items to output this cycle
E0320 08:37:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:43.409788 543705 cpu.go:282] Add success.
I0320 08:37:43.409793 543705 memory.go:191] Add success.
I0320 08:37:43.419841 543705 net.go:648] Add success.
I0320 08:37:43.422507 543705 net.go:770] primary dev: ETH0
I0320 08:37:43.422520 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:37:43.422533 543705 net.go:698] Add success.
I0320 08:37:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:37:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:37:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:37:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:37:53.409783 543705 memory.go:184] no items to output this cycle
I0320 08:37:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 08:38:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:03.409802 543705 memory.go:184] no items to output this cycle
I0320 08:38:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:38:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:13.409804 543705 memory.go:191] Add success.
I0320 08:38:13.409807 543705 cpu.go:282] Add success.
W0320 08:38:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:38:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:38:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:38:13.420258 543705 net.go:648] Add success.
I0320 08:38:13.423419 543705 net.go:770] primary dev: ETH0
I0320 08:38:13.423435 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:38:13.423449 543705 net.go:698] Add success.
I0320 08:38:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:38:14.455291 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:38:14.455487 543705 disk_worker.go:708] disk space is not compliant
W0320 08:38:14.455491 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:38:14.457234 543705 disk_worker.go:494] system disk:vda1
I0320 08:38:14.457265 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:38:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:38:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:38:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:38:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:38:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:23.409769 543705 memory.go:184] no items to output this cycle
I0320 08:38:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 08:38:23.493663 543705 disk_info.go:125] begin check local disk info of client
I0320 08:38:23.496060 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:38:23.496065 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aad40 0xc0001aad80]
E0320 08:38:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:33.409779 543705 memory.go:184] no items to output this cycle
I0320 08:38:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 08:38:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:43.409795 543705 cpu.go:282] Add success.
I0320 08:38:43.409803 543705 memory.go:191] Add success.
I0320 08:38:43.420031 543705 net.go:648] Add success.
I0320 08:38:43.423025 543705 net.go:770] primary dev: ETH0
I0320 08:38:43.423040 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:38:43.423054 543705 net.go:698] Add success.
I0320 08:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:38:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:38:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:38:53.410334 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:38:53.410349 543705 memory.go:184] no items to output this cycle
I0320 08:38:53.410366 543705 cpu.go:275] no items to output this cycle
E0320 08:39:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:03.409811 543705 memory.go:184] no items to output this cycle
I0320 08:39:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 08:39:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:13.409797 543705 memory.go:191] Add success.
I0320 08:39:13.409818 543705 cpu.go:282] Add success.
W0320 08:39:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:39:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:39:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:39:13.420135 543705 net.go:648] Add success.
I0320 08:39:13.423160 543705 net.go:770] primary dev: ETH0
I0320 08:39:13.423173 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:39:13.423185 543705 net.go:698] Add success.
I0320 08:39:13.470124 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"082c9658-5c53-4cfb-a855-31823abe110b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:39:13.470156 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:39:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:39:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:39:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 08:39:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:39:14.456542 543705 disk_worker.go:494] system disk:vda1
I0320 08:39:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:39:15.455612 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:39:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:39:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:39:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:39:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:39:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:23.409786 543705 memory.go:184] no items to output this cycle
I0320 08:39:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 08:39:23.496675 543705 disk_info.go:125] begin check local disk info of client
I0320 08:39:23.499085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:39:23.499090 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab4c0 0xc0001ab500]
E0320 08:39:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:33.409784 543705 memory.go:184] no items to output this cycle
I0320 08:39:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 08:39:38.345743 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:39:38.345750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:39:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:43.410672 543705 memory.go:191] Add success.
I0320 08:39:43.409835 543705 cpu.go:282] Add success.
I0320 08:39:43.420488 543705 net.go:648] Add success.
I0320 08:39:43.424660 543705 net.go:770] primary dev: ETH0
I0320 08:39:43.424672 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:39:43.424686 543705 net.go:698] Add success.
I0320 08:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:39:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:39:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:39:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:39:53.409779 543705 memory.go:184] no items to output this cycle
I0320 08:39:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 08:40:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:03.409806 543705 memory.go:184] no items to output this cycle
I0320 08:40:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 08:40:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:13.409835 543705 memory.go:191] Add success.
I0320 08:40:13.409843 543705 cpu.go:282] Add success.
W0320 08:40:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:40:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:40:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:40:13.420415 543705 net.go:648] Add success.
I0320 08:40:13.423231 543705 net.go:770] primary dev: ETH0
I0320 08:40:13.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:40:13.423254 543705 net.go:698] Add success.
I0320 08:40:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:40:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:40:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 08:40:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:40:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 08:40:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:40:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:40:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:40:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:40:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:40:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:40:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:23.409801 543705 memory.go:184] no items to output this cycle
I0320 08:40:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 08:40:23.499664 543705 disk_info.go:125] begin check local disk info of client
I0320 08:40:23.502150 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:40:23.502156 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b9c0 0xc00007ba00]
E0320 08:40:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:33.409764 543705 memory.go:184] no items to output this cycle
I0320 08:40:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 08:40:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:43.409806 543705 memory.go:191] Add success.
I0320 08:40:43.409821 543705 cpu.go:282] Add success.
I0320 08:40:43.419898 543705 net.go:648] Add success.
I0320 08:40:43.422849 543705 net.go:770] primary dev: ETH0
I0320 08:40:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:40:43.422876 543705 net.go:698] Add success.
I0320 08:40:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:40:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:40:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:40:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:40:53.409763 543705 memory.go:184] no items to output this cycle
I0320 08:40:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:41:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:03.409761 543705 memory.go:184] no items to output this cycle
I0320 08:41:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 08:41:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:13.409800 543705 memory.go:191] Add success.
I0320 08:41:13.409832 543705 cpu.go:282] Add success.
W0320 08:41:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:41:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:41:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:41:13.420172 543705 net.go:648] Add success.
I0320 08:41:13.423166 543705 net.go:770] primary dev: ETH0
I0320 08:41:13.423181 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:41:13.423194 543705 net.go:698] Add success.
I0320 08:41:14.453945 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:41:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:41:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 08:41:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:41:14.456620 543705 disk_worker.go:494] system disk:vda1
I0320 08:41:14.456648 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:41:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:41:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:41:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:41:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:41:16.472359 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:41:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:23.409783 543705 memory.go:184] no items to output this cycle
I0320 08:41:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 08:41:23.502705 543705 disk_info.go:125] begin check local disk info of client
I0320 08:41:23.505090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:41:23.505096 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa680 0xc0001aa6c0]
E0320 08:41:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:33.409780 543705 memory.go:184] no items to output this cycle
I0320 08:41:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 08:41:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:43.409783 543705 memory.go:191] Add success.
I0320 08:41:43.409803 543705 cpu.go:282] Add success.
I0320 08:41:43.419901 543705 net.go:648] Add success.
I0320 08:41:43.422813 543705 net.go:770] primary dev: ETH0
I0320 08:41:43.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:41:43.422840 543705 net.go:698] Add success.
I0320 08:41:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:41:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:41:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:41:53.409776 543705 memory.go:184] no items to output this cycle
I0320 08:41:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 08:42:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:03.409769 543705 memory.go:184] no items to output this cycle
I0320 08:42:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 08:42:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:13.409823 543705 memory.go:191] Add success.
I0320 08:42:13.409830 543705 cpu.go:282] Add success.
W0320 08:42:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:42:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:42:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:42:13.420779 543705 net.go:648] Add success.
I0320 08:42:13.423768 543705 net.go:770] primary dev: ETH0
I0320 08:42:13.423781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:42:13.423792 543705 net.go:698] Add success.
I0320 08:42:13.467684 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ab6a0dcf-e6b5-43fd-8abd-8d7b52c8dcfa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:42:13.467715 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 08:42:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:42:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 08:42:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:42:14.455987 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:42:14.455996 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:42:14.456002 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:42:14.456710 543705 disk_worker.go:494] system disk:vda1
I0320 08:42:14.456740 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:42:15.456777 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:42:15.456785 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 08:42:16.457918 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:42:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:42:16.457975 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:42:16.457993 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:42:16.472299 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:42:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 08:42:23.409780 543705 memory.go:184] no items to output this cycle
I0320 08:42:23.505667 543705 disk_info.go:125] begin check local disk info of client
I0320 08:42:23.508075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:42:23.508080 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa140 0xc0001fa180]
E0320 08:42:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:33.409774 543705 memory.go:184] no items to output this cycle
I0320 08:42:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 08:42:38.349736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:42:38.349744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:42:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:43.410739 543705 memory.go:191] Add success.
I0320 08:42:43.409798 543705 cpu.go:282] Add success.
I0320 08:42:43.420467 543705 net.go:648] Add success.
I0320 08:42:43.423378 543705 net.go:770] primary dev: ETH0
I0320 08:42:43.423394 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:42:43.423409 543705 net.go:698] Add success.
I0320 08:42:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:42:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:42:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:42:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:42:53.409799 543705 memory.go:184] no items to output this cycle
I0320 08:42:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 08:43:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:03.409763 543705 memory.go:184] no items to output this cycle
I0320 08:43:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 08:43:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:13.409805 543705 memory.go:191] Add success.
I0320 08:43:13.409822 543705 cpu.go:282] Add success.
W0320 08:43:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:43:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:43:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:43:13.420426 543705 net.go:648] Add success.
I0320 08:43:13.423004 543705 net.go:770] primary dev: ETH0
I0320 08:43:13.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:43:13.423028 543705 net.go:698] Add success.
I0320 08:43:14.453955 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:43:14.455225 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:43:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0320 08:43:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:43:14.456646 543705 disk_worker.go:494] system disk:vda1
I0320 08:43:14.456676 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:43:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:43:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:43:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:43:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:43:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:43:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:23.409765 543705 memory.go:184] no items to output this cycle
I0320 08:43:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 08:43:23.508731 543705 disk_info.go:125] begin check local disk info of client
I0320 08:43:23.511163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:43:23.511168 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f1800 0xc0003f1840]
E0320 08:43:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 08:43:33.409800 543705 memory.go:184] no items to output this cycle
E0320 08:43:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:43.409787 543705 memory.go:191] Add success.
I0320 08:43:43.409805 543705 cpu.go:282] Add success.
I0320 08:43:43.419865 543705 net.go:648] Add success.
I0320 08:43:43.422794 543705 net.go:770] primary dev: ETH0
I0320 08:43:43.422807 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:43:43.422822 543705 net.go:698] Add success.
I0320 08:43:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:43:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:43:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:43:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:43:53.409780 543705 cpu.go:275] no items to output this cycle
I0320 08:43:53.409783 543705 memory.go:184] no items to output this cycle
E0320 08:44:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:03.409782 543705 memory.go:184] no items to output this cycle
I0320 08:44:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 08:44:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:13.409802 543705 memory.go:191] Add success.
I0320 08:44:13.409822 543705 cpu.go:282] Add success.
W0320 08:44:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:44:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:44:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:44:13.420433 543705 net.go:648] Add success.
I0320 08:44:13.423305 543705 net.go:770] primary dev: ETH0
I0320 08:44:13.423319 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:44:13.423334 543705 net.go:698] Add success.
I0320 08:44:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:44:14.455216 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:44:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 08:44:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:44:14.456620 543705 disk_worker.go:494] system disk:vda1
I0320 08:44:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:44:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:44:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:44:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:44:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:44:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:44:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:23.409798 543705 memory.go:184] no items to output this cycle
I0320 08:44:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 08:44:23.511755 543705 disk_info.go:125] begin check local disk info of client
I0320 08:44:23.514298 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:44:23.514303 543705 disk_info.go:196] parse disk info done, disk is : [0xc000546d80 0xc000546dc0]
E0320 08:44:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:33.409806 543705 memory.go:184] no items to output this cycle
I0320 08:44:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 08:44:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:43.409821 543705 memory.go:191] Add success.
I0320 08:44:43.409825 543705 cpu.go:282] Add success.
I0320 08:44:43.419954 543705 net.go:648] Add success.
I0320 08:44:43.423469 543705 net.go:770] primary dev: ETH0
I0320 08:44:43.423483 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:44:43.423496 543705 net.go:698] Add success.
I0320 08:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:44:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:44:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:44:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:44:53.409799 543705 memory.go:184] no items to output this cycle
I0320 08:44:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 08:45:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:03.409802 543705 memory.go:184] no items to output this cycle
I0320 08:45:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 08:45:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:13.409807 543705 memory.go:191] Add success.
I0320 08:45:13.409811 543705 cpu.go:282] Add success.
W0320 08:45:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:45:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:45:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:45:13.420133 543705 net.go:648] Add success.
I0320 08:45:13.423048 543705 net.go:770] primary dev: ETH0
I0320 08:45:13.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:45:13.423073 543705 net.go:698] Add success.
I0320 08:45:13.494629 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"504558e1-c4f0-483c-a892-d5f37feda00d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:45:13.494661 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:45:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:45:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:45:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 08:45:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:45:14.456705 543705 disk_worker.go:494] system disk:vda1
I0320 08:45:14.456740 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:45:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:45:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:45:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:45:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:45:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:23.409801 543705 memory.go:184] no items to output this cycle
I0320 08:45:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 08:45:23.514706 543705 disk_info.go:125] begin check local disk info of client
I0320 08:45:23.517171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:45:23.517177 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e8480 0xc0003e84c0]
E0320 08:45:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:33.409777 543705 memory.go:184] no items to output this cycle
I0320 08:45:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 08:45:38.353737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:45:38.353744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:45:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:43.410522 543705 memory.go:191] Add success.
I0320 08:45:43.409917 543705 cpu.go:282] Add success.
I0320 08:45:43.419709 543705 net.go:648] Add success.
I0320 08:45:43.422166 543705 net.go:770] primary dev: ETH0
I0320 08:45:43.422178 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:45:43.422190 543705 net.go:698] Add success.
I0320 08:45:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:45:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:45:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:45:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:45:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 08:45:53.409790 543705 memory.go:184] no items to output this cycle
E0320 08:46:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:03.409804 543705 memory.go:184] no items to output this cycle
I0320 08:46:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 08:46:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:13.409795 543705 memory.go:191] Add success.
I0320 08:46:13.409816 543705 cpu.go:282] Add success.
W0320 08:46:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:46:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:46:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:46:13.420258 543705 net.go:648] Add success.
I0320 08:46:13.422766 543705 net.go:770] primary dev: ETH0
I0320 08:46:13.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:46:13.422792 543705 net.go:698] Add success.
I0320 08:46:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:46:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:46:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 08:46:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:46:14.456625 543705 disk_worker.go:494] system disk:vda1
I0320 08:46:14.456657 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:46:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:46:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:46:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:46:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:46:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:46:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:23.409767 543705 memory.go:184] no items to output this cycle
I0320 08:46:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 08:46:23.517771 543705 disk_info.go:125] begin check local disk info of client
I0320 08:46:23.520157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:46:23.520162 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004644c0 0xc000464500]
E0320 08:46:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:33.409773 543705 memory.go:184] no items to output this cycle
I0320 08:46:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 08:46:43.409847 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:43.409875 543705 memory.go:191] Add success.
I0320 08:46:43.409958 543705 cpu.go:282] Add success.
I0320 08:46:43.419709 543705 net.go:648] Add success.
I0320 08:46:43.422286 543705 net.go:770] primary dev: ETH0
I0320 08:46:43.422299 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:46:43.422311 543705 net.go:698] Add success.
I0320 08:46:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:46:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:46:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:46:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:46:53.409762 543705 memory.go:184] no items to output this cycle
I0320 08:46:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 08:47:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:03.409797 543705 memory.go:184] no items to output this cycle
I0320 08:47:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 08:47:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:13.409805 543705 memory.go:191] Add success.
I0320 08:47:13.409807 543705 cpu.go:282] Add success.
W0320 08:47:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:47:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:47:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:47:13.420143 543705 net.go:648] Add success.
I0320 08:47:13.423205 543705 net.go:770] primary dev: ETH0
I0320 08:47:13.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:47:13.423229 543705 net.go:698] Add success.
I0320 08:47:13.453751 543705 event_worker.go:152] Polling the log file for events...
W0320 08:47:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:47:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 08:47:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:47:14.456774 543705 disk_worker.go:494] system disk:vda1
I0320 08:47:14.456813 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:47:14.457153 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:47:14.457161 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:47:14.457166 543705 custom_config.go:64] query custom config with name: gpu
E0320 08:47:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:47:15.456793 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:47:16.457911 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:47:16.457921 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:47:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:47:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:47:16.472348 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:47:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:23.409778 543705 memory.go:184] no items to output this cycle
I0320 08:47:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 08:47:23.520794 543705 disk_info.go:125] begin check local disk info of client
I0320 08:47:23.523171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:47:23.523176 543705 disk_info.go:196] parse disk info done, disk is : [0xc000349ec0 0xc000349f00]
E0320 08:47:33.409904 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:33.409917 543705 cpu.go:275] no items to output this cycle
I0320 08:47:33.409926 543705 memory.go:184] no items to output this cycle
E0320 08:47:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:43.409782 543705 memory.go:191] Add success.
I0320 08:47:43.409811 543705 cpu.go:282] Add success.
I0320 08:47:43.419885 543705 net.go:648] Add success.
I0320 08:47:43.423166 543705 net.go:770] primary dev: ETH0
I0320 08:47:43.423179 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:47:43.423191 543705 net.go:698] Add success.
I0320 08:47:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:47:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:47:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:47:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:47:53.409798 543705 memory.go:184] no items to output this cycle
I0320 08:47:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:48:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:03.409776 543705 memory.go:184] no items to output this cycle
I0320 08:48:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 08:48:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:13.409808 543705 memory.go:191] Add success.
I0320 08:48:13.409810 543705 cpu.go:282] Add success.
W0320 08:48:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:48:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:48:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:48:13.420356 543705 net.go:648] Add success.
I0320 08:48:13.422833 543705 net.go:770] primary dev: ETH0
I0320 08:48:13.422846 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:48:13.422859 543705 net.go:698] Add success.
I0320 08:48:13.475430 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f90c77c9-409e-4416-8dfa-be1b36dc4814","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:48:13.475463 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:48:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:48:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:48:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0320 08:48:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:48:14.456645 543705 disk_worker.go:494] system disk:vda1
I0320 08:48:14.456680 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:48:15.454995 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:48:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:48:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:48:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:48:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:48:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:23.409781 543705 memory.go:184] no items to output this cycle
I0320 08:48:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 08:48:23.523773 543705 disk_info.go:125] begin check local disk info of client
I0320 08:48:23.526207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:48:23.526213 543705 disk_info.go:196] parse disk info done, disk is : [0xc000497b00 0xc000497b40]
E0320 08:48:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:33.409774 543705 memory.go:184] no items to output this cycle
I0320 08:48:33.409853 543705 cpu.go:275] no items to output this cycle
I0320 08:48:38.357740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:48:38.357747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:48:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:43.410519 543705 memory.go:191] Add success.
I0320 08:48:43.409802 543705 cpu.go:282] Add success.
I0320 08:48:43.420224 543705 net.go:648] Add success.
I0320 08:48:43.422657 543705 net.go:770] primary dev: ETH0
I0320 08:48:43.422671 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:48:43.422684 543705 net.go:698] Add success.
I0320 08:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:48:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:48:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:48:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:48:53.409797 543705 memory.go:184] no items to output this cycle
I0320 08:48:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:49:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:03.409777 543705 memory.go:184] no items to output this cycle
I0320 08:49:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 08:49:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:13.409796 543705 memory.go:191] Add success.
W0320 08:49:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 08:49:13.409827 543705 cpu.go:282] Add success.
W0320 08:49:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:49:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:49:13.420379 543705 net.go:648] Add success.
I0320 08:49:13.423036 543705 net.go:770] primary dev: ETH0
I0320 08:49:13.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:49:13.423063 543705 net.go:698] Add success.
I0320 08:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:49:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:49:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 08:49:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:49:14.456857 543705 disk_worker.go:494] system disk:vda1
I0320 08:49:14.456885 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:49:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:49:16.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:49:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:49:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:49:16.472425 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:49:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:23.409776 543705 memory.go:184] no items to output this cycle
I0320 08:49:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 08:49:23.526290 543705 disk_info.go:125] begin check local disk info of client
I0320 08:49:23.528717 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:49:23.528723 543705 disk_info.go:196] parse disk info done, disk is : [0xc000393b00 0xc000393b40]
E0320 08:49:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:33.409783 543705 memory.go:184] no items to output this cycle
I0320 08:49:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 08:49:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:43.409793 543705 memory.go:191] Add success.
I0320 08:49:43.409796 543705 cpu.go:282] Add success.
I0320 08:49:43.420018 543705 net.go:648] Add success.
I0320 08:49:43.423221 543705 net.go:770] primary dev: ETH0
I0320 08:49:43.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:49:43.423247 543705 net.go:698] Add success.
I0320 08:49:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:49:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:49:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:49:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:49:53.409790 543705 cpu.go:275] no items to output this cycle
I0320 08:49:53.409792 543705 memory.go:184] no items to output this cycle
E0320 08:50:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:03.409793 543705 memory.go:184] no items to output this cycle
I0320 08:50:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 08:50:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:13.409809 543705 memory.go:191] Add success.
I0320 08:50:13.409814 543705 cpu.go:282] Add success.
W0320 08:50:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:50:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:50:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:50:13.420150 543705 net.go:648] Add success.
I0320 08:50:13.423145 543705 net.go:770] primary dev: ETH0
I0320 08:50:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:50:13.423170 543705 net.go:698] Add success.
I0320 08:50:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:50:14.455312 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:50:14.455420 543705 disk_worker.go:708] disk space is not compliant
W0320 08:50:14.455425 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:50:14.457139 543705 disk_worker.go:494] system disk:vda1
I0320 08:50:14.457181 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:50:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:50:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:50:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:50:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:50:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:50:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:23.409775 543705 memory.go:184] no items to output this cycle
I0320 08:50:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 08:50:23.528763 543705 disk_info.go:125] begin check local disk info of client
I0320 08:50:23.531244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:50:23.531249 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004914c0 0xc000491500]
E0320 08:50:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:33.409771 543705 memory.go:184] no items to output this cycle
I0320 08:50:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 08:50:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:43.409796 543705 memory.go:191] Add success.
I0320 08:50:43.409798 543705 cpu.go:282] Add success.
I0320 08:50:43.420055 543705 net.go:648] Add success.
I0320 08:50:43.423047 543705 net.go:770] primary dev: ETH0
I0320 08:50:43.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:50:43.423075 543705 net.go:698] Add success.
I0320 08:50:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:50:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:50:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:50:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:50:53.409794 543705 memory.go:184] no items to output this cycle
I0320 08:50:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 08:51:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:03.409776 543705 memory.go:184] no items to output this cycle
I0320 08:51:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 08:51:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:13.409829 543705 memory.go:191] Add success.
I0320 08:51:13.409835 543705 cpu.go:282] Add success.
W0320 08:51:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:51:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:51:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:51:13.420211 543705 net.go:648] Add success.
I0320 08:51:13.423388 543705 net.go:770] primary dev: ETH0
I0320 08:51:13.423401 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:51:13.423413 543705 net.go:698] Add success.
I0320 08:51:13.573137 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31f054c0-c773-4b0d-823c-046b856523e6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:51:13.573170 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:51:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:51:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:51:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 08:51:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:51:14.459207 543705 disk_worker.go:494] system disk:vda1
I0320 08:51:14.459247 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:51:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:51:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:51:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:51:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:51:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:51:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:23.409767 543705 memory.go:184] no items to output this cycle
I0320 08:51:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 08:51:23.531810 543705 disk_info.go:125] begin check local disk info of client
I0320 08:51:23.534265 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:51:23.534270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004961c0 0xc000496200]
E0320 08:51:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:33.409796 543705 memory.go:184] no items to output this cycle
I0320 08:51:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 08:51:38.361733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:51:38.361739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:51:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:43.410679 543705 memory.go:191] Add success.
I0320 08:51:43.409802 543705 cpu.go:282] Add success.
I0320 08:51:43.420375 543705 net.go:648] Add success.
I0320 08:51:43.423259 543705 net.go:770] primary dev: ETH0
I0320 08:51:43.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:51:43.423284 543705 net.go:698] Add success.
I0320 08:51:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:51:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:51:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:51:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:51:53.409771 543705 memory.go:184] no items to output this cycle
I0320 08:51:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 08:52:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:03.409778 543705 memory.go:184] no items to output this cycle
I0320 08:52:03.409783 543705 cpu.go:275] no items to output this cycle
W0320 08:52:13.409718 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:52:13.409735 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:52:13.409740 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:52:13.409808 543705 cpu.go:282] Add success.
E0320 08:52:13.409815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:13.409833 543705 memory.go:191] Add success.
I0320 08:52:13.420316 543705 net.go:648] Add success.
I0320 08:52:13.423472 543705 net.go:770] primary dev: ETH0
I0320 08:52:13.423487 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:52:13.423499 543705 net.go:698] Add success.
W0320 08:52:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:52:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 08:52:14.455197 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:52:14.455945 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:52:14.455954 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:52:14.455961 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:52:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 08:52:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:52:15.455918 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:52:15.455927 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 08:52:16.457080 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:52:16.458133 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:52:16.458195 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:52:16.458216 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:52:16.472533 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:52:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:23.409794 543705 memory.go:184] no items to output this cycle
I0320 08:52:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 08:52:23.534803 543705 disk_info.go:125] begin check local disk info of client
I0320 08:52:23.537253 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:52:23.537259 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007afc0 0xc00007b000]
E0320 08:52:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:33.409809 543705 memory.go:184] no items to output this cycle
I0320 08:52:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 08:52:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:43.409785 543705 memory.go:191] Add success.
I0320 08:52:43.409804 543705 cpu.go:282] Add success.
I0320 08:52:43.419980 543705 net.go:648] Add success.
I0320 08:52:43.422660 543705 net.go:770] primary dev: ETH0
I0320 08:52:43.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:52:43.422690 543705 net.go:698] Add success.
I0320 08:52:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:52:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:52:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:52:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:52:53.409791 543705 memory.go:184] no items to output this cycle
I0320 08:52:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 08:53:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:03.409772 543705 memory.go:184] no items to output this cycle
I0320 08:53:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 08:53:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:13.409830 543705 memory.go:191] Add success.
I0320 08:53:13.409836 543705 cpu.go:282] Add success.
W0320 08:53:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:53:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:53:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:53:13.420152 543705 net.go:648] Add success.
I0320 08:53:13.422878 543705 net.go:770] primary dev: ETH0
I0320 08:53:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:53:13.422903 543705 net.go:698] Add success.
I0320 08:53:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:53:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:53:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 08:53:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:53:14.456609 543705 disk_worker.go:494] system disk:vda1
I0320 08:53:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:53:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:53:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:53:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:53:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:53:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:23.409906 543705 memory.go:184] no items to output this cycle
I0320 08:53:23.409914 543705 cpu.go:275] no items to output this cycle
I0320 08:53:23.538306 543705 disk_info.go:125] begin check local disk info of client
I0320 08:53:23.540696 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:53:23.540701 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb4c0 0xc0001fb500]
E0320 08:53:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:33.409769 543705 memory.go:184] no items to output this cycle
I0320 08:53:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 08:53:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:43.409815 543705 memory.go:191] Add success.
I0320 08:53:43.409826 543705 cpu.go:282] Add success.
I0320 08:53:43.419955 543705 net.go:648] Add success.
I0320 08:53:43.422804 543705 net.go:770] primary dev: ETH0
I0320 08:53:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:53:43.422833 543705 net.go:698] Add success.
I0320 08:53:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:53:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:53:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:53:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:53:53.409770 543705 memory.go:184] no items to output this cycle
I0320 08:53:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 08:54:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:03.409773 543705 memory.go:184] no items to output this cycle
I0320 08:54:03.409777 543705 cpu.go:275] no items to output this cycle
E0320 08:54:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:13.409827 543705 memory.go:191] Add success.
I0320 08:54:13.409837 543705 cpu.go:282] Add success.
W0320 08:54:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:54:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:54:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:54:13.420139 543705 net.go:648] Add success.
I0320 08:54:13.422846 543705 net.go:770] primary dev: ETH0
I0320 08:54:13.422861 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:54:13.422875 543705 net.go:698] Add success.
I0320 08:54:13.469344 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"873bf6ce-d62b-4c77-8460-0d78f78c4765","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:54:13.469377 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 08:54:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:54:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:54:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 08:54:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:54:14.456652 543705 disk_worker.go:494] system disk:vda1
I0320 08:54:14.456686 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:54:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:54:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:54:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:54:23.410864 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:23.410889 543705 memory.go:184] no items to output this cycle
I0320 08:54:23.410974 543705 cpu.go:275] no items to output this cycle
I0320 08:54:23.541418 543705 disk_info.go:125] begin check local disk info of client
I0320 08:54:23.543807 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:54:23.543814 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bbdc0 0xc0002bbe00]
E0320 08:54:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:33.409801 543705 memory.go:184] no items to output this cycle
I0320 08:54:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 08:54:38.365741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:54:38.365749 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:54:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:43.410575 543705 memory.go:191] Add success.
I0320 08:54:43.409818 543705 cpu.go:282] Add success.
I0320 08:54:43.420307 543705 net.go:648] Add success.
I0320 08:54:43.423327 543705 net.go:770] primary dev: ETH0
I0320 08:54:43.423341 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:54:43.423365 543705 net.go:698] Add success.
I0320 08:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:54:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:54:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:54:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:54:53.409795 543705 memory.go:184] no items to output this cycle
I0320 08:54:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 08:55:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:03.409780 543705 memory.go:184] no items to output this cycle
I0320 08:55:03.409783 543705 cpu.go:275] no items to output this cycle
W0320 08:55:13.409720 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:55:13.409738 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:55:13.409743 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:55:13.409822 543705 cpu.go:282] Add success.
E0320 08:55:13.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:13.409867 543705 memory.go:191] Add success.
I0320 08:55:13.420247 543705 net.go:648] Add success.
I0320 08:55:13.422790 543705 net.go:770] primary dev: ETH0
I0320 08:55:13.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:55:13.422815 543705 net.go:698] Add success.
I0320 08:55:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:55:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:55:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 08:55:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:55:14.456651 543705 disk_worker.go:494] system disk:vda1
I0320 08:55:14.456682 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:55:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:55:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:55:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:55:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:55:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:55:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:23.409802 543705 memory.go:184] no items to output this cycle
I0320 08:55:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 08:55:23.543837 543705 disk_info.go:125] begin check local disk info of client
I0320 08:55:23.546543 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:55:23.546549 543705 disk_info.go:196] parse disk info done, disk is : [0xc000380000 0xc000380040]
E0320 08:55:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:33.409778 543705 memory.go:184] no items to output this cycle
I0320 08:55:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 08:55:43.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:43.409915 543705 memory.go:191] Add success.
I0320 08:55:43.409932 543705 cpu.go:282] Add success.
I0320 08:55:43.420306 543705 net.go:648] Add success.
I0320 08:55:43.422953 543705 net.go:770] primary dev: ETH0
I0320 08:55:43.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:55:43.422979 543705 net.go:698] Add success.
I0320 08:55:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:55:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:55:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:55:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:55:53.409787 543705 memory.go:184] no items to output this cycle
I0320 08:55:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 08:56:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:03.409779 543705 memory.go:184] no items to output this cycle
I0320 08:56:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 08:56:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:13.409807 543705 memory.go:191] Add success.
I0320 08:56:13.409807 543705 cpu.go:282] Add success.
W0320 08:56:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:56:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:56:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:56:13.420249 543705 net.go:648] Add success.
I0320 08:56:13.423154 543705 net.go:770] primary dev: ETH0
I0320 08:56:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:56:13.423182 543705 net.go:698] Add success.
I0320 08:56:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:56:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:56:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 08:56:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:56:14.456508 543705 disk_worker.go:494] system disk:vda1
I0320 08:56:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:56:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:56:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:56:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:56:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:56:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:56:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:23.409770 543705 memory.go:184] no items to output this cycle
I0320 08:56:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 08:56:23.546619 543705 disk_info.go:125] begin check local disk info of client
I0320 08:56:23.549131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:56:23.549136 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471c40 0xc000471c80]
E0320 08:56:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:33.409806 543705 memory.go:184] no items to output this cycle
I0320 08:56:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 08:56:43.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:43.409920 543705 memory.go:191] Add success.
I0320 08:56:43.409921 543705 cpu.go:282] Add success.
I0320 08:56:43.419727 543705 net.go:648] Add success.
I0320 08:56:43.422562 543705 net.go:770] primary dev: ETH0
I0320 08:56:43.422577 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:56:43.422591 543705 net.go:698] Add success.
I0320 08:56:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:56:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:56:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:56:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:56:53.409793 543705 memory.go:184] no items to output this cycle
I0320 08:56:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 08:57:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:03.409765 543705 memory.go:184] no items to output this cycle
I0320 08:57:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 08:57:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:13.409798 543705 memory.go:191] Add success.
W0320 08:57:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:57:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:57:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:57:13.409847 543705 cpu.go:282] Add success.
I0320 08:57:13.420339 543705 net.go:648] Add success.
I0320 08:57:13.427437 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 08:57:13.427513 543705 net.go:770] primary dev: ETH0
I0320 08:57:13.427536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:57:13.427549 543705 net.go:698] Add success.
I0320 08:57:13.453146 543705 event_worker.go:152] Polling the log file for events...
I0320 08:57:13.469815 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bcdd4107-0d98-44d0-badf-a71b7c7d84c6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 08:57:13.469847 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 08:57:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:57:14.455256 543705 disk_worker.go:708] disk space is not compliant
W0320 08:57:14.455261 543705 disk_worker.go:728] disk inode is not compliant
E0320 08:57:14.456063 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 08:57:14.456072 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 08:57:14.456078 543705 custom_config.go:64] query custom config with name: gpu
I0320 08:57:14.457023 543705 disk_worker.go:494] system disk:vda1
I0320 08:57:14.457053 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 08:57:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 08:57:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:57:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 08:57:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 08:57:16.458004 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:57:16.458021 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:57:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:57:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:23.409775 543705 memory.go:184] no items to output this cycle
I0320 08:57:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 08:57:23.549933 543705 disk_info.go:125] begin check local disk info of client
I0320 08:57:23.552390 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:57:23.552395 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c540 0xc00035c580]
E0320 08:57:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:33.409777 543705 memory.go:184] no items to output this cycle
I0320 08:57:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 08:57:38.369728 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 08:57:38.369734 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 08:57:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:43.410689 543705 memory.go:191] Add success.
I0320 08:57:43.409817 543705 cpu.go:282] Add success.
I0320 08:57:43.419720 543705 net.go:648] Add success.
I0320 08:57:43.422592 543705 net.go:770] primary dev: ETH0
I0320 08:57:43.422607 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:57:43.422621 543705 net.go:698] Add success.
I0320 08:57:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:57:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:57:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:57:53.409772 543705 memory.go:184] no items to output this cycle
I0320 08:57:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 08:58:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:03.409777 543705 cpu.go:275] no items to output this cycle
I0320 08:58:03.409784 543705 memory.go:184] no items to output this cycle
E0320 08:58:13.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:13.409819 543705 cpu.go:282] Add success.
I0320 08:58:13.409843 543705 memory.go:191] Add success.
W0320 08:58:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:58:13.409900 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:58:13.409904 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:58:13.420238 543705 net.go:648] Add success.
I0320 08:58:13.421166 543705 net.go:770] primary dev: ETH0
I0320 08:58:13.421184 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:58:13.421201 543705 net.go:698] Add success.
I0320 08:58:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:58:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:58:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 08:58:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:58:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 08:58:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:58:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:58:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:58:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:58:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:23.409781 543705 memory.go:184] no items to output this cycle
I0320 08:58:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 08:58:23.552904 543705 disk_info.go:125] begin check local disk info of client
I0320 08:58:23.555382 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:58:23.555387 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c500 0xc00035c540]
E0320 08:58:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:33.409781 543705 memory.go:184] no items to output this cycle
I0320 08:58:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 08:58:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:43.409800 543705 memory.go:191] Add success.
I0320 08:58:43.409800 543705 cpu.go:282] Add success.
I0320 08:58:43.419746 543705 net.go:648] Add success.
I0320 08:58:43.423007 543705 net.go:770] primary dev: ETH0
I0320 08:58:43.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:58:43.423031 543705 net.go:698] Add success.
I0320 08:58:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:58:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:58:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:58:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:58:53.409795 543705 memory.go:184] no items to output this cycle
I0320 08:58:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 08:59:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:03.409788 543705 cpu.go:275] no items to output this cycle
I0320 08:59:03.409801 543705 memory.go:184] no items to output this cycle
E0320 08:59:13.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:13.409821 543705 cpu.go:282] Add success.
I0320 08:59:13.409837 543705 memory.go:191] Add success.
W0320 08:59:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 08:59:13.409895 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 08:59:13.409900 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 08:59:13.420341 543705 net.go:648] Add success.
I0320 08:59:13.421316 543705 net.go:770] primary dev: ETH0
I0320 08:59:13.421330 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:59:13.421342 543705 net.go:698] Add success.
I0320 08:59:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 08:59:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 08:59:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 08:59:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0320 08:59:14.456633 543705 disk_worker.go:494] system disk:vda1
I0320 08:59:14.456665 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 08:59:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 08:59:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:59:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:59:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 08:59:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0320 08:59:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:23.409783 543705 memory.go:184] no items to output this cycle
I0320 08:59:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 08:59:23.555463 543705 disk_info.go:125] begin check local disk info of client
I0320 08:59:23.557969 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 08:59:23.557974 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035bdc0 0xc00035be00]
E0320 08:59:33.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:33.409824 543705 memory.go:184] no items to output this cycle
I0320 08:59:33.409828 543705 cpu.go:275] no items to output this cycle
E0320 08:59:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:43.409790 543705 memory.go:191] Add success.
I0320 08:59:43.409813 543705 cpu.go:282] Add success.
I0320 08:59:43.419838 543705 net.go:648] Add success.
I0320 08:59:43.422391 543705 net.go:770] primary dev: ETH0
I0320 08:59:43.422405 543705 net.go:802] Send network stats successfully!,count is 6
I0320 08:59:43.422416 543705 net.go:698] Add success.
I0320 08:59:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 08:59:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 08:59:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 08:59:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 08:59:53.409811 543705 memory.go:184] no items to output this cycle
I0320 08:59:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 09:00:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:03.409805 543705 memory.go:184] no items to output this cycle
I0320 09:00:03.409818 543705 cpu.go:275] no items to output this cycle
I0320 09:00:13.409821 543705 cpu.go:282] Add success.
E0320 09:00:13.410129 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:13.410148 543705 memory.go:191] Add success.
W0320 09:00:13.410172 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:00:13.410184 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:00:13.410186 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:00:13.420389 543705 net.go:648] Add success.
I0320 09:00:13.421480 543705 net.go:770] primary dev: ETH0
I0320 09:00:13.421494 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:00:13.421506 543705 net.go:698] Add success.
I0320 09:00:13.469021 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2a49a950-c78e-4091-b64d-54b4b4935b79","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:00:13.469077 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:00:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:00:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:00:14.455228 543705 disk_worker.go:708] disk space is not compliant
W0320 09:00:14.455231 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:00:14.456642 543705 disk_worker.go:494] system disk:vda1
I0320 09:00:14.456676 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:00:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:00:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:00:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:00:23.410235 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:23.410254 543705 memory.go:184] no items to output this cycle
I0320 09:00:23.410256 543705 cpu.go:275] no items to output this cycle
I0320 09:00:23.558449 543705 disk_info.go:125] begin check local disk info of client
I0320 09:00:23.560901 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:00:23.560906 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003babc0 0xc0003bac00]
E0320 09:00:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:33.409774 543705 memory.go:184] no items to output this cycle
I0320 09:00:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 09:00:38.373738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:00:38.373746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:00:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:43.410570 543705 memory.go:191] Add success.
I0320 09:00:43.409836 543705 cpu.go:282] Add success.
I0320 09:00:43.420289 543705 net.go:648] Add success.
I0320 09:00:43.423102 543705 net.go:770] primary dev: ETH0
I0320 09:00:43.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:00:43.423130 543705 net.go:698] Add success.
I0320 09:00:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:00:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:00:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:00:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:00:53.409782 543705 memory.go:184] no items to output this cycle
I0320 09:00:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:01:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:03.409761 543705 memory.go:184] no items to output this cycle
I0320 09:01:03.409795 543705 cpu.go:275] no items to output this cycle
W0320 09:01:13.409726 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:01:13.409749 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:01:13.409755 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:01:13.409842 543705 cpu.go:282] Add success.
E0320 09:01:13.409852 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:13.409880 543705 memory.go:191] Add success.
I0320 09:01:13.420502 543705 net.go:648] Add success.
I0320 09:01:13.423300 543705 net.go:770] primary dev: ETH0
I0320 09:01:13.423319 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:01:13.423338 543705 net.go:698] Add success.
I0320 09:01:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:01:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:01:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 09:01:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:01:14.456522 543705 disk_worker.go:494] system disk:vda1
I0320 09:01:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:01:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:01:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:01:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:01:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:01:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:01:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:23.409788 543705 memory.go:184] no items to output this cycle
I0320 09:01:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 09:01:23.560940 543705 disk_info.go:125] begin check local disk info of client
I0320 09:01:23.563419 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:01:23.563424 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003da680 0xc0003da6c0]
E0320 09:01:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:33.409811 543705 memory.go:184] no items to output this cycle
I0320 09:01:33.409825 543705 cpu.go:275] no items to output this cycle
E0320 09:01:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:43.409796 543705 cpu.go:282] Add success.
I0320 09:01:43.409798 543705 memory.go:191] Add success.
I0320 09:01:43.419835 543705 net.go:648] Add success.
I0320 09:01:43.422431 543705 net.go:770] primary dev: ETH0
I0320 09:01:43.422445 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:01:43.422458 543705 net.go:698] Add success.
I0320 09:01:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:01:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:01:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:01:53.409905 543705 cpu.go:275] no items to output this cycle
E0320 09:01:53.409910 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:01:53.409944 543705 memory.go:184] no items to output this cycle
E0320 09:02:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:03.409793 543705 memory.go:184] no items to output this cycle
I0320 09:02:03.409807 543705 cpu.go:275] no items to output this cycle
W0320 09:02:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:02:13.409761 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:02:13.409768 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:02:13.409806 543705 cpu.go:282] Add success.
E0320 09:02:13.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:13.411440 543705 memory.go:191] Add success.
I0320 09:02:13.420602 543705 net.go:648] Add success.
I0320 09:02:13.422980 543705 net.go:770] primary dev: ETH0
I0320 09:02:13.422993 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:02:13.423005 543705 net.go:698] Add success.
W0320 09:02:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:02:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0320 09:02:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:02:14.456890 543705 disk_worker.go:494] system disk:vda1
I0320 09:02:14.456936 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:02:14.457310 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:02:14.457319 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:02:14.457324 543705 custom_config.go:64] query custom config with name: gpu
E0320 09:02:15.456854 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:02:15.456865 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:02:16.458064 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:02:16.458074 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:02:16.458122 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:02:16.458142 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:02:16.472506 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:02:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:23.409800 543705 memory.go:184] no items to output this cycle
I0320 09:02:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 09:02:23.563507 543705 disk_info.go:125] begin check local disk info of client
I0320 09:02:23.565983 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:02:23.565989 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb280 0xc0001fb2c0]
E0320 09:02:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:33.409796 543705 memory.go:184] no items to output this cycle
I0320 09:02:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 09:02:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:43.409805 543705 memory.go:191] Add success.
I0320 09:02:43.409806 543705 cpu.go:282] Add success.
I0320 09:02:43.419914 543705 net.go:648] Add success.
I0320 09:02:43.422594 543705 net.go:770] primary dev: ETH0
I0320 09:02:43.422608 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:02:43.422621 543705 net.go:698] Add success.
I0320 09:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:02:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:02:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:02:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:02:53.409818 543705 memory.go:184] no items to output this cycle
I0320 09:02:53.409831 543705 cpu.go:275] no items to output this cycle
E0320 09:03:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:03.409802 543705 memory.go:184] no items to output this cycle
I0320 09:03:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 09:03:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:13.409779 543705 memory.go:191] Add success.
W0320 09:03:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:03:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:03:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:03:13.409852 543705 cpu.go:282] Add success.
I0320 09:03:13.420404 543705 net.go:648] Add success.
I0320 09:03:13.423393 543705 net.go:770] primary dev: ETH0
I0320 09:03:13.423409 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:03:13.423421 543705 net.go:698] Add success.
I0320 09:03:13.731694 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d03af66a-f353-4911-8478-2b58fdbef580","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:03:13.731752 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:03:14.454713 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:03:14.454879 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:03:14.454960 543705 disk_worker.go:708] disk space is not compliant
W0320 09:03:14.454964 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:03:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 09:03:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:03:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:03:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:03:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:03:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:03:16.472413 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:03:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:23.409809 543705 memory.go:184] no items to output this cycle
I0320 09:03:23.409825 543705 cpu.go:275] no items to output this cycle
I0320 09:03:23.566727 543705 disk_info.go:125] begin check local disk info of client
I0320 09:03:23.569198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:03:23.569204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2c00 0xc0003b2c40]
E0320 09:03:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:33.409795 543705 memory.go:184] no items to output this cycle
I0320 09:03:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 09:03:38.377735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:03:38.377741 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:03:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:43.410722 543705 memory.go:191] Add success.
I0320 09:03:43.409821 543705 cpu.go:282] Add success.
I0320 09:03:43.420506 543705 net.go:648] Add success.
I0320 09:03:43.423343 543705 net.go:770] primary dev: ETH0
I0320 09:03:43.423356 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:03:43.423368 543705 net.go:698] Add success.
I0320 09:03:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:03:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:03:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:03:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:03:53.409771 543705 memory.go:184] no items to output this cycle
I0320 09:03:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 09:04:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:03.409768 543705 memory.go:184] no items to output this cycle
I0320 09:04:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 09:04:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:13.409792 543705 memory.go:191] Add success.
I0320 09:04:13.409792 543705 cpu.go:282] Add success.
W0320 09:04:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:04:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:04:13.420246 543705 net.go:648] Add success.
I0320 09:04:13.422883 543705 net.go:770] primary dev: ETH0
I0320 09:04:13.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:04:13.422908 543705 net.go:698] Add success.
I0320 09:04:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:04:14.455215 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:04:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0320 09:04:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:04:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 09:04:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:04:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:04:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:04:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:04:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:04:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:04:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:23.409772 543705 memory.go:184] no items to output this cycle
I0320 09:04:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 09:04:23.569667 543705 disk_info.go:125] begin check local disk info of client
I0320 09:04:23.572157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:04:23.572162 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7c40 0xc0001c7c80]
E0320 09:04:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:33.409775 543705 memory.go:184] no items to output this cycle
I0320 09:04:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:04:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:43.409797 543705 memory.go:191] Add success.
I0320 09:04:43.409803 543705 cpu.go:282] Add success.
I0320 09:04:43.419842 543705 net.go:648] Add success.
I0320 09:04:43.422525 543705 net.go:770] primary dev: ETH0
I0320 09:04:43.422538 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:04:43.422551 543705 net.go:698] Add success.
I0320 09:04:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:04:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:04:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:04:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:04:53.409779 543705 memory.go:184] no items to output this cycle
I0320 09:04:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:05:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:03.409804 543705 memory.go:184] no items to output this cycle
I0320 09:05:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 09:05:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:13.409803 543705 memory.go:191] Add success.
I0320 09:05:13.409805 543705 cpu.go:282] Add success.
W0320 09:05:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:05:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:05:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:05:13.420137 543705 net.go:648] Add success.
I0320 09:05:13.422868 543705 net.go:770] primary dev: ETH0
I0320 09:05:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:05:13.422894 543705 net.go:698] Add success.
I0320 09:05:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:05:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:05:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 09:05:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:05:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 09:05:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:05:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:05:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:05:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:05:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:05:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:05:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:23.409795 543705 memory.go:184] no items to output this cycle
I0320 09:05:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 09:05:23.573092 543705 disk_info.go:125] begin check local disk info of client
I0320 09:05:23.575518 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:05:23.575524 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7a80 0xc0001c7ac0]
E0320 09:05:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:33.409778 543705 memory.go:184] no items to output this cycle
I0320 09:05:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 09:05:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:43.409787 543705 memory.go:191] Add success.
I0320 09:05:43.409790 543705 cpu.go:282] Add success.
I0320 09:05:43.419869 543705 net.go:648] Add success.
I0320 09:05:43.422536 543705 net.go:770] primary dev: ETH0
I0320 09:05:43.422549 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:05:43.422563 543705 net.go:698] Add success.
I0320 09:05:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:05:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:05:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:05:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:05:53.409905 543705 cpu.go:275] no items to output this cycle
I0320 09:05:53.409958 543705 memory.go:184] no items to output this cycle
E0320 09:06:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:03.409782 543705 memory.go:184] no items to output this cycle
I0320 09:06:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 09:06:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:13.409778 543705 memory.go:191] Add success.
W0320 09:06:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:06:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:06:13.409816 543705 cpu.go:282] Add success.
I0320 09:06:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:06:13.420142 543705 net.go:648] Add success.
I0320 09:06:13.422852 543705 net.go:770] primary dev: ETH0
I0320 09:06:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:06:13.422877 543705 net.go:698] Add success.
I0320 09:06:13.468933 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e70b7c07-5866-4f67-b38f-19acb1fcac36","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:06:13.468967 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:06:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:06:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:06:14.455245 543705 disk_worker.go:708] disk space is not compliant
W0320 09:06:14.455249 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:06:14.456676 543705 disk_worker.go:494] system disk:vda1
I0320 09:06:14.456709 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:06:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:06:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:06:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:06:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:06:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:06:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:23.409780 543705 memory.go:184] no items to output this cycle
I0320 09:06:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 09:06:23.576065 543705 disk_info.go:125] begin check local disk info of client
I0320 09:06:23.578476 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:06:23.578482 543705 disk_info.go:196] parse disk info done, disk is : [0xc000480740 0xc000480780]
E0320 09:06:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:33.409779 543705 memory.go:184] no items to output this cycle
I0320 09:06:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 09:06:38.381726 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:06:38.381733 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:06:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:43.410630 543705 memory.go:191] Add success.
I0320 09:06:43.409808 543705 cpu.go:282] Add success.
I0320 09:06:43.420472 543705 net.go:648] Add success.
I0320 09:06:43.423049 543705 net.go:770] primary dev: ETH0
I0320 09:06:43.423062 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:06:43.423074 543705 net.go:698] Add success.
I0320 09:06:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:06:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:06:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:06:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:06:53.409768 543705 memory.go:184] no items to output this cycle
I0320 09:06:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 09:07:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:03.409776 543705 memory.go:184] no items to output this cycle
I0320 09:07:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:07:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:13.409813 543705 memory.go:191] Add success.
I0320 09:07:13.409822 543705 cpu.go:282] Add success.
W0320 09:07:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:07:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:07:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:07:13.420238 543705 net.go:648] Add success.
I0320 09:07:13.422788 543705 net.go:770] primary dev: ETH0
I0320 09:07:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:07:13.422815 543705 net.go:698] Add success.
I0320 09:07:13.453366 543705 event_worker.go:152] Polling the log file for events...
W0320 09:07:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:07:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 09:07:14.455216 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:07:14.457150 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
I0320 09:07:14.457158 543705 disk_worker.go:494] system disk:vda1
E0320 09:07:14.457161 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:07:14.457168 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:07:14.457194 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:07:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:07:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:07:16.458039 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:07:16.458050 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:07:16.458101 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:07:16.458118 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:07:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:07:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:23.409774 543705 memory.go:184] no items to output this cycle
I0320 09:07:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 09:07:23.578993 543705 disk_info.go:125] begin check local disk info of client
I0320 09:07:23.581407 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:07:23.581412 543705 disk_info.go:196] parse disk info done, disk is : [0xc000381040 0xc000381080]
E0320 09:07:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 09:07:33.409795 543705 memory.go:184] no items to output this cycle
E0320 09:07:43.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:43.409918 543705 memory.go:191] Add success.
I0320 09:07:43.409928 543705 cpu.go:282] Add success.
I0320 09:07:43.419709 543705 net.go:648] Add success.
I0320 09:07:43.422405 543705 net.go:770] primary dev: ETH0
I0320 09:07:43.422419 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:07:43.422431 543705 net.go:698] Add success.
I0320 09:07:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:07:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:07:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:07:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:07:53.409797 543705 memory.go:184] no items to output this cycle
I0320 09:07:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 09:08:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:03.409770 543705 memory.go:184] no items to output this cycle
I0320 09:08:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 09:08:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:13.409797 543705 memory.go:191] Add success.
I0320 09:08:13.409800 543705 cpu.go:282] Add success.
W0320 09:08:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:08:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:08:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:08:13.420133 543705 net.go:648] Add success.
I0320 09:08:13.422730 543705 net.go:770] primary dev: ETH0
I0320 09:08:13.422743 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:08:13.422757 543705 net.go:698] Add success.
I0320 09:08:14.454988 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:08:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:08:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0320 09:08:14.455239 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:08:14.456671 543705 disk_worker.go:494] system disk:vda1
I0320 09:08:14.456705 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:08:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:08:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:08:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:08:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:08:16.472428 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:08:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:23.409781 543705 memory.go:184] no items to output this cycle
I0320 09:08:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 09:08:23.581668 543705 disk_info.go:125] begin check local disk info of client
I0320 09:08:23.584098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:08:23.584104 543705 disk_info.go:196] parse disk info done, disk is : [0xc000307c00 0xc000307c40]
E0320 09:08:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:33.409812 543705 memory.go:184] no items to output this cycle
I0320 09:08:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 09:08:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:43.409805 543705 memory.go:191] Add success.
I0320 09:08:43.409806 543705 cpu.go:282] Add success.
I0320 09:08:43.420318 543705 net.go:648] Add success.
I0320 09:08:43.422806 543705 net.go:770] primary dev: ETH0
I0320 09:08:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:08:43.422831 543705 net.go:698] Add success.
I0320 09:08:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:08:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:08:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:08:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:08:53.409778 543705 memory.go:184] no items to output this cycle
I0320 09:08:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:09:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:03.409778 543705 memory.go:184] no items to output this cycle
I0320 09:09:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 09:09:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:13.409815 543705 memory.go:191] Add success.
I0320 09:09:13.409822 543705 cpu.go:282] Add success.
W0320 09:09:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:09:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:09:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:09:13.420161 543705 net.go:648] Add success.
I0320 09:09:13.423049 543705 net.go:770] primary dev: ETH0
I0320 09:09:13.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:09:13.423078 543705 net.go:698] Add success.
I0320 09:09:13.464209 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e861b529-13ef-48f2-ac7a-e9836c5a8281","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:09:13.464243 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:09:14.454984 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:09:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:09:14.455233 543705 disk_worker.go:708] disk space is not compliant
W0320 09:09:14.455237 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:09:14.456656 543705 disk_worker.go:494] system disk:vda1
I0320 09:09:14.456689 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:09:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:09:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:09:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:09:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:09:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:09:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:23.409798 543705 memory.go:184] no items to output this cycle
I0320 09:09:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 09:09:23.585090 543705 disk_info.go:125] begin check local disk info of client
I0320 09:09:23.587563 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:09:23.587568 543705 disk_info.go:196] parse disk info done, disk is : [0xc000490c00 0xc000490c40]
E0320 09:09:33.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:33.409894 543705 memory.go:184] no items to output this cycle
I0320 09:09:33.409941 543705 cpu.go:275] no items to output this cycle
I0320 09:09:38.385741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:09:38.385746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:09:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:43.410638 543705 memory.go:191] Add success.
I0320 09:09:43.409824 543705 cpu.go:282] Add success.
I0320 09:09:43.420424 543705 net.go:648] Add success.
I0320 09:09:43.423031 543705 net.go:770] primary dev: ETH0
I0320 09:09:43.423044 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:09:43.423057 543705 net.go:698] Add success.
I0320 09:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:09:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:09:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:09:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:09:53.409763 543705 memory.go:184] no items to output this cycle
I0320 09:09:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 09:10:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:03.409766 543705 memory.go:184] no items to output this cycle
I0320 09:10:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 09:10:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:13.409793 543705 memory.go:191] Add success.
I0320 09:10:13.409818 543705 cpu.go:282] Add success.
W0320 09:10:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:10:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:10:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:10:13.420136 543705 net.go:648] Add success.
I0320 09:10:13.422685 543705 net.go:770] primary dev: ETH0
I0320 09:10:13.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:10:13.422709 543705 net.go:698] Add success.
I0320 09:10:14.454988 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:10:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:10:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 09:10:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:10:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 09:10:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:10:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:10:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:10:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:10:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:10:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:10:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:23.409812 543705 memory.go:184] no items to output this cycle
I0320 09:10:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 09:10:23.587676 543705 disk_info.go:125] begin check local disk info of client
I0320 09:10:23.590106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:10:23.590112 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b7240 0xc0002b7280]
E0320 09:10:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:33.409770 543705 memory.go:184] no items to output this cycle
I0320 09:10:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 09:10:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:43.409794 543705 memory.go:191] Add success.
I0320 09:10:43.409799 543705 cpu.go:282] Add success.
I0320 09:10:43.420006 543705 net.go:648] Add success.
I0320 09:10:43.422905 543705 net.go:770] primary dev: ETH0
I0320 09:10:43.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:10:43.422932 543705 net.go:698] Add success.
I0320 09:10:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:10:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:10:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:10:53.409768 543705 memory.go:184] no items to output this cycle
I0320 09:10:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 09:11:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:03.409772 543705 memory.go:184] no items to output this cycle
I0320 09:11:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 09:11:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:13.409782 543705 memory.go:191] Add success.
I0320 09:11:13.409802 543705 cpu.go:282] Add success.
W0320 09:11:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:11:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:11:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:11:13.420122 543705 net.go:648] Add success.
I0320 09:11:13.422910 543705 net.go:770] primary dev: ETH0
I0320 09:11:13.422922 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:11:13.422934 543705 net.go:698] Add success.
I0320 09:11:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:11:14.455224 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:11:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0320 09:11:14.455239 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:11:14.456689 543705 disk_worker.go:494] system disk:vda1
I0320 09:11:14.456725 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:11:15.454985 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:11:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:11:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:11:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:11:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:11:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:23.409801 543705 memory.go:184] no items to output this cycle
I0320 09:11:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 09:11:23.590723 543705 disk_info.go:125] begin check local disk info of client
I0320 09:11:23.593144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:11:23.593149 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005474c0 0xc000547500]
E0320 09:11:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:33.409779 543705 memory.go:184] no items to output this cycle
I0320 09:11:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 09:11:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:43.409820 543705 memory.go:191] Add success.
I0320 09:11:43.409822 543705 cpu.go:282] Add success.
I0320 09:11:43.419868 543705 net.go:648] Add success.
I0320 09:11:43.422537 543705 net.go:770] primary dev: ETH0
I0320 09:11:43.422550 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:11:43.422563 543705 net.go:698] Add success.
I0320 09:11:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:11:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:11:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:11:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:11:53.409806 543705 memory.go:184] no items to output this cycle
I0320 09:11:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 09:12:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:03.409774 543705 memory.go:184] no items to output this cycle
I0320 09:12:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:12:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:13.409827 543705 memory.go:191] Add success.
I0320 09:12:13.409833 543705 cpu.go:282] Add success.
W0320 09:12:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:12:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:12:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:12:13.420121 543705 net.go:648] Add success.
I0320 09:12:13.423038 543705 net.go:770] primary dev: ETH0
I0320 09:12:13.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:12:13.423063 543705 net.go:698] Add success.
I0320 09:12:13.467327 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"42a8d73c-0c50-48bc-8348-7d9ad5f2e0d9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:12:13.467360 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 09:12:14.455231 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:12:14.455245 543705 disk_worker.go:708] disk space is not compliant
W0320 09:12:14.455247 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:12:14.456557 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:12:14.456568 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:12:14.456575 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:12:14.457080 543705 disk_worker.go:494] system disk:vda1
I0320 09:12:14.457125 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:12:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:12:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:12:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:12:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:12:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:12:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:12:16.472335 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:12:23.410404 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:23.410419 543705 memory.go:184] no items to output this cycle
I0320 09:12:23.410459 543705 cpu.go:275] no items to output this cycle
I0320 09:12:23.593669 543705 disk_info.go:125] begin check local disk info of client
I0320 09:12:23.596040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:12:23.596045 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b5ac0 0xc0002b5b00]
E0320 09:12:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:33.409797 543705 memory.go:184] no items to output this cycle
I0320 09:12:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 09:12:38.389735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:12:38.389742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:12:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:43.410523 543705 memory.go:191] Add success.
I0320 09:12:43.409829 543705 cpu.go:282] Add success.
I0320 09:12:43.420316 543705 net.go:648] Add success.
I0320 09:12:43.422890 543705 net.go:770] primary dev: ETH0
I0320 09:12:43.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:12:43.422918 543705 net.go:698] Add success.
I0320 09:12:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:12:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:12:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:12:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:12:53.409781 543705 memory.go:184] no items to output this cycle
I0320 09:12:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 09:13:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:03.409798 543705 memory.go:184] no items to output this cycle
I0320 09:13:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 09:13:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:13.409817 543705 memory.go:191] Add success.
I0320 09:13:13.409827 543705 cpu.go:282] Add success.
W0320 09:13:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:13:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:13:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:13:13.420166 543705 net.go:648] Add success.
I0320 09:13:13.423259 543705 net.go:770] primary dev: ETH0
I0320 09:13:13.423272 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:13:13.423284 543705 net.go:698] Add success.
I0320 09:13:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:13:14.455216 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:13:14.455230 543705 disk_worker.go:708] disk space is not compliant
W0320 09:13:14.455232 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:13:14.456944 543705 disk_worker.go:494] system disk:vda1
I0320 09:13:14.456978 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:13:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:13:16.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:13:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:13:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:13:16.472457 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:13:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:23.409799 543705 memory.go:184] no items to output this cycle
I0320 09:13:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 09:13:23.596115 543705 disk_info.go:125] begin check local disk info of client
I0320 09:13:23.598550 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:13:23.598555 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046e280 0xc00046e2c0]
E0320 09:13:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:33.409792 543705 memory.go:184] no items to output this cycle
I0320 09:13:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 09:13:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:43.409796 543705 memory.go:191] Add success.
I0320 09:13:43.409796 543705 cpu.go:282] Add success.
I0320 09:13:43.419856 543705 net.go:648] Add success.
I0320 09:13:43.422463 543705 net.go:770] primary dev: ETH0
I0320 09:13:43.422478 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:13:43.422493 543705 net.go:698] Add success.
I0320 09:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:13:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:13:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:13:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:13:53.409786 543705 memory.go:184] no items to output this cycle
I0320 09:13:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 09:14:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:03.409781 543705 memory.go:184] no items to output this cycle
I0320 09:14:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 09:14:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:13.409806 543705 memory.go:191] Add success.
I0320 09:14:13.409808 543705 cpu.go:282] Add success.
W0320 09:14:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:14:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:14:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:14:13.420082 543705 net.go:648] Add success.
I0320 09:14:13.423103 543705 net.go:770] primary dev: ETH0
I0320 09:14:13.423117 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:14:13.423133 543705 net.go:698] Add success.
I0320 09:14:14.453937 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:14:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:14:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 09:14:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:14:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 09:14:14.456733 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:14:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:14:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:14:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:14:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:14:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:14:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:23.409779 543705 memory.go:184] no items to output this cycle
I0320 09:14:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 09:14:23.599141 543705 disk_info.go:125] begin check local disk info of client
I0320 09:14:23.601534 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:14:23.601539 543705 disk_info.go:196] parse disk info done, disk is : [0xc000247080 0xc0002470c0]
E0320 09:14:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:33.409790 543705 memory.go:184] no items to output this cycle
I0320 09:14:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 09:14:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:43.409831 543705 memory.go:191] Add success.
I0320 09:14:43.409836 543705 cpu.go:282] Add success.
I0320 09:14:43.419997 543705 net.go:648] Add success.
I0320 09:14:43.423222 543705 net.go:770] primary dev: ETH0
I0320 09:14:43.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:14:43.423249 543705 net.go:698] Add success.
I0320 09:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:14:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:14:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:14:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:14:53.409779 543705 memory.go:184] no items to output this cycle
I0320 09:14:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 09:15:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:03.409807 543705 memory.go:184] no items to output this cycle
I0320 09:15:03.409821 543705 cpu.go:275] no items to output this cycle
W0320 09:15:13.409713 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:15:13.409736 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:15:13.409742 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 09:15:13.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:13.409849 543705 cpu.go:282] Add success.
I0320 09:15:13.409870 543705 memory.go:191] Add success.
I0320 09:15:13.420063 543705 net.go:648] Add success.
I0320 09:15:13.422799 543705 net.go:770] primary dev: ETH0
I0320 09:15:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:15:13.422829 543705 net.go:698] Add success.
I0320 09:15:13.468966 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"34bcd4d7-feae-4376-b1f7-7fc72204dc22","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:15:13.469000 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:15:14.453955 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:15:14.455332 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:15:14.455348 543705 disk_worker.go:708] disk space is not compliant
W0320 09:15:14.455353 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:15:14.457684 543705 disk_worker.go:494] system disk:vda1
I0320 09:15:14.457722 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:15:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:15:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:15:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:15:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:15:16.472434 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:15:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:23.409788 543705 memory.go:184] no items to output this cycle
I0320 09:15:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 09:15:23.601669 543705 disk_info.go:125] begin check local disk info of client
I0320 09:15:23.604187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:15:23.604192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c3c80 0xc0004c3cc0]
E0320 09:15:33.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:33.409824 543705 memory.go:184] no items to output this cycle
I0320 09:15:33.409839 543705 cpu.go:275] no items to output this cycle
I0320 09:15:38.393733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:15:38.393740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:15:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:43.410649 543705 memory.go:191] Add success.
I0320 09:15:43.409830 543705 cpu.go:282] Add success.
I0320 09:15:43.420415 543705 net.go:648] Add success.
I0320 09:15:43.423041 543705 net.go:770] primary dev: ETH0
I0320 09:15:43.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:15:43.423068 543705 net.go:698] Add success.
I0320 09:15:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:15:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:15:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:15:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:15:53.409796 543705 memory.go:184] no items to output this cycle
I0320 09:15:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 09:16:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:03.409775 543705 memory.go:184] no items to output this cycle
I0320 09:16:03.409803 543705 cpu.go:275] no items to output this cycle
W0320 09:16:13.409709 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:16:13.409726 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:16:13.409730 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:16:13.409796 543705 cpu.go:282] Add success.
E0320 09:16:13.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:13.409824 543705 memory.go:191] Add success.
I0320 09:16:13.420135 543705 net.go:648] Add success.
I0320 09:16:13.422908 543705 net.go:770] primary dev: ETH0
I0320 09:16:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:16:13.422933 543705 net.go:698] Add success.
I0320 09:16:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:16:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:16:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 09:16:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:16:14.458030 543705 disk_worker.go:494] system disk:vda1
I0320 09:16:14.458062 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:16:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:16:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:16:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:16:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:16:16.472489 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:16:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:23.409800 543705 memory.go:184] no items to output this cycle
I0320 09:16:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 09:16:23.605195 543705 disk_info.go:125] begin check local disk info of client
I0320 09:16:23.607773 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:16:23.607779 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6e00 0xc0001c6e40]
E0320 09:16:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:33.409775 543705 memory.go:184] no items to output this cycle
I0320 09:16:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 09:16:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:43.409822 543705 memory.go:191] Add success.
I0320 09:16:43.409833 543705 cpu.go:282] Add success.
I0320 09:16:43.419877 543705 net.go:648] Add success.
I0320 09:16:43.422665 543705 net.go:770] primary dev: ETH0
I0320 09:16:43.422679 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:16:43.422693 543705 net.go:698] Add success.
I0320 09:16:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:16:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:16:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:16:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:16:53.409776 543705 memory.go:184] no items to output this cycle
I0320 09:16:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:17:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:03.409783 543705 memory.go:184] no items to output this cycle
I0320 09:17:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:17:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:13.409790 543705 memory.go:191] Add success.
I0320 09:17:13.409792 543705 cpu.go:282] Add success.
W0320 09:17:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:17:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:17:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:17:13.420076 543705 net.go:648] Add success.
I0320 09:17:13.422550 543705 net.go:770] primary dev: ETH0
I0320 09:17:13.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:17:13.422576 543705 net.go:698] Add success.
I0320 09:17:13.453100 543705 event_worker.go:152] Polling the log file for events...
W0320 09:17:14.455362 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:17:14.455380 543705 disk_worker.go:708] disk space is not compliant
W0320 09:17:14.455385 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:17:14.458153 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:17:14.458161 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:17:14.458166 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:17:14.459244 543705 disk_worker.go:494] system disk:vda1
I0320 09:17:14.459296 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:17:15.457066 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:17:15.457080 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:17:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:17:16.457996 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:17:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:17:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:17:16.472457 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:17:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:23.409774 543705 memory.go:184] no items to output this cycle
I0320 09:17:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 09:17:23.608143 543705 disk_info.go:125] begin check local disk info of client
I0320 09:17:23.610618 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:17:23.610623 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaac0 0xc0001aab00]
E0320 09:17:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:33.409809 543705 memory.go:184] no items to output this cycle
I0320 09:17:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 09:17:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:43.409787 543705 memory.go:191] Add success.
I0320 09:17:43.409805 543705 cpu.go:282] Add success.
I0320 09:17:43.420027 543705 net.go:648] Add success.
I0320 09:17:43.422728 543705 net.go:770] primary dev: ETH0
I0320 09:17:43.422742 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:17:43.422754 543705 net.go:698] Add success.
I0320 09:17:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:17:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:17:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:17:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:17:53.409775 543705 memory.go:184] no items to output this cycle
I0320 09:17:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 09:18:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:03.409784 543705 memory.go:184] no items to output this cycle
I0320 09:18:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:18:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:13.409804 543705 memory.go:191] Add success.
I0320 09:18:13.409804 543705 cpu.go:282] Add success.
W0320 09:18:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:18:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:18:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:18:13.420403 543705 net.go:648] Add success.
I0320 09:18:13.423040 543705 net.go:770] primary dev: ETH0
I0320 09:18:13.423054 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:18:13.423067 543705 net.go:698] Add success.
I0320 09:18:13.605473 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fdffd18c-0093-412e-8888-d0a8a5466285","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:18:13.605504 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:18:14.454806 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:18:14.454957 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:18:14.454968 543705 disk_worker.go:708] disk space is not compliant
W0320 09:18:14.454971 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:18:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 09:18:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:18:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:18:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:18:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:18:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:18:16.472456 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:18:23.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:23.409898 543705 memory.go:184] no items to output this cycle
I0320 09:18:23.409907 543705 cpu.go:275] no items to output this cycle
I0320 09:18:23.611677 543705 disk_info.go:125] begin check local disk info of client
I0320 09:18:23.614276 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:18:23.614281 543705 disk_info.go:196] parse disk info done, disk is : [0xc000257080 0xc0002570c0]
E0320 09:18:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:33.409777 543705 memory.go:184] no items to output this cycle
I0320 09:18:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 09:18:38.397730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:18:38.397737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:18:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:43.410609 543705 memory.go:191] Add success.
I0320 09:18:43.409794 543705 cpu.go:282] Add success.
I0320 09:18:43.420317 543705 net.go:648] Add success.
I0320 09:18:43.423057 543705 net.go:770] primary dev: ETH0
I0320 09:18:43.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:18:43.423092 543705 net.go:698] Add success.
I0320 09:18:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:18:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:18:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:18:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:18:53.409782 543705 memory.go:184] no items to output this cycle
I0320 09:18:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:19:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:03.409782 543705 memory.go:184] no items to output this cycle
I0320 09:19:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 09:19:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:13.409810 543705 memory.go:191] Add success.
I0320 09:19:13.409816 543705 cpu.go:282] Add success.
W0320 09:19:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:19:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:19:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:19:13.420116 543705 net.go:648] Add success.
I0320 09:19:13.422691 543705 net.go:770] primary dev: ETH0
I0320 09:19:13.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:19:13.422721 543705 net.go:698] Add success.
I0320 09:19:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:19:14.455275 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:19:14.455291 543705 disk_worker.go:708] disk space is not compliant
W0320 09:19:14.455295 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:19:14.457165 543705 disk_worker.go:494] system disk:vda1
I0320 09:19:14.457199 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:19:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:19:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:19:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:19:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:19:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:23.409773 543705 memory.go:184] no items to output this cycle
I0320 09:19:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 09:19:23.615210 543705 disk_info.go:125] begin check local disk info of client
I0320 09:19:23.617740 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:19:23.617745 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003de6c0 0xc0003de700]
E0320 09:19:33.409954 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:33.409977 543705 memory.go:184] no items to output this cycle
I0320 09:19:33.410015 543705 cpu.go:275] no items to output this cycle
E0320 09:19:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:43.409790 543705 memory.go:191] Add success.
I0320 09:19:43.409798 543705 cpu.go:282] Add success.
I0320 09:19:43.420056 543705 net.go:648] Add success.
I0320 09:19:43.422606 543705 net.go:770] primary dev: ETH0
I0320 09:19:43.422620 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:19:43.422632 543705 net.go:698] Add success.
I0320 09:19:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:19:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:19:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:19:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:19:53.409767 543705 memory.go:184] no items to output this cycle
I0320 09:19:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 09:20:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:03.409784 543705 memory.go:184] no items to output this cycle
I0320 09:20:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 09:20:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:13.409788 543705 memory.go:191] Add success.
I0320 09:20:13.409791 543705 cpu.go:282] Add success.
W0320 09:20:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:20:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:20:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:20:13.420111 543705 net.go:648] Add success.
I0320 09:20:13.423092 543705 net.go:770] primary dev: ETH0
I0320 09:20:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:20:13.423117 543705 net.go:698] Add success.
I0320 09:20:14.453933 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:20:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:20:14.455293 543705 disk_worker.go:708] disk space is not compliant
W0320 09:20:14.455298 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:20:14.457534 543705 disk_worker.go:494] system disk:vda1
I0320 09:20:14.457583 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:20:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:20:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:20:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:20:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:20:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:20:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:23.409801 543705 memory.go:184] no items to output this cycle
I0320 09:20:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 09:20:23.617825 543705 disk_info.go:125] begin check local disk info of client
I0320 09:20:23.620398 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:20:23.620404 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b5340 0xc0004b5380]
E0320 09:20:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:33.409765 543705 memory.go:184] no items to output this cycle
I0320 09:20:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 09:20:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:43.409801 543705 memory.go:191] Add success.
I0320 09:20:43.409802 543705 cpu.go:282] Add success.
I0320 09:20:43.419881 543705 net.go:648] Add success.
I0320 09:20:43.422496 543705 net.go:770] primary dev: ETH0
I0320 09:20:43.422510 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:20:43.422524 543705 net.go:698] Add success.
I0320 09:20:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:20:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:20:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:20:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:20:53.409785 543705 memory.go:184] no items to output this cycle
I0320 09:20:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 09:21:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:03.409785 543705 memory.go:184] no items to output this cycle
I0320 09:21:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 09:21:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:13.409798 543705 memory.go:191] Add success.
I0320 09:21:13.409798 543705 cpu.go:282] Add success.
W0320 09:21:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:21:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:21:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:21:13.420172 543705 net.go:648] Add success.
I0320 09:21:13.422835 543705 net.go:770] primary dev: ETH0
I0320 09:21:13.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:21:13.422859 543705 net.go:698] Add success.
I0320 09:21:13.468818 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"08413161-335c-4137-a175-4f9dc6eda5cd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:21:13.468850 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:21:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:21:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:21:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 09:21:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:21:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 09:21:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:21:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:21:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:21:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:21:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:21:16.472452 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:21:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:23.409786 543705 memory.go:184] no items to output this cycle
I0320 09:21:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 09:21:23.621217 543705 disk_info.go:125] begin check local disk info of client
I0320 09:21:23.623747 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:21:23.623752 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487840 0xc000487880]
E0320 09:21:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:33.409793 543705 memory.go:184] no items to output this cycle
I0320 09:21:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 09:21:38.401741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:21:38.401748 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:21:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:43.410605 543705 memory.go:191] Add success.
I0320 09:21:43.409830 543705 cpu.go:282] Add success.
I0320 09:21:43.420473 543705 net.go:648] Add success.
I0320 09:21:43.423098 543705 net.go:770] primary dev: ETH0
I0320 09:21:43.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:21:43.423123 543705 net.go:698] Add success.
I0320 09:21:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:21:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:21:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:21:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:21:53.409805 543705 memory.go:184] no items to output this cycle
I0320 09:21:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 09:22:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:03.409787 543705 memory.go:184] no items to output this cycle
I0320 09:22:03.409796 543705 cpu.go:275] no items to output this cycle
W0320 09:22:13.409701 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:22:13.409716 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:22:13.409721 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 09:22:13.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:13.409821 543705 memory.go:191] Add success.
I0320 09:22:13.409835 543705 cpu.go:282] Add success.
I0320 09:22:13.420035 543705 net.go:648] Add success.
I0320 09:22:13.422700 543705 net.go:770] primary dev: ETH0
I0320 09:22:13.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:22:13.422725 543705 net.go:698] Add success.
W0320 09:22:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:22:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 09:22:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:22:14.456913 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:22:14.456922 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:22:14.456928 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:22:14.456991 543705 disk_worker.go:494] system disk:vda1
I0320 09:22:14.457018 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:22:15.457035 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:22:15.457050 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:22:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:22:16.457995 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:22:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:22:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:22:16.472522 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:22:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:23.409774 543705 memory.go:184] no items to output this cycle
I0320 09:22:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 09:22:23.623832 543705 disk_info.go:125] begin check local disk info of client
I0320 09:22:23.626315 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:22:23.626320 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487280 0xc0004872c0]
E0320 09:22:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:33.409764 543705 memory.go:184] no items to output this cycle
I0320 09:22:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 09:22:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:43.409788 543705 memory.go:191] Add success.
I0320 09:22:43.409817 543705 cpu.go:282] Add success.
I0320 09:22:43.420141 543705 net.go:648] Add success.
I0320 09:22:43.422720 543705 net.go:770] primary dev: ETH0
I0320 09:22:43.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:22:43.422748 543705 net.go:698] Add success.
I0320 09:22:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:22:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:22:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:22:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:22:53.409771 543705 memory.go:184] no items to output this cycle
I0320 09:22:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:23:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:03.409785 543705 memory.go:184] no items to output this cycle
I0320 09:23:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 09:23:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:13.409788 543705 memory.go:191] Add success.
I0320 09:23:13.409798 543705 cpu.go:282] Add success.
W0320 09:23:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:23:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:23:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:23:13.420138 543705 net.go:648] Add success.
I0320 09:23:13.422840 543705 net.go:770] primary dev: ETH0
I0320 09:23:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:23:13.422875 543705 net.go:698] Add success.
I0320 09:23:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:23:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:23:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 09:23:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:23:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 09:23:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:23:15.455984 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:23:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:23:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:23:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:23:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:23:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:23.409780 543705 memory.go:184] no items to output this cycle
I0320 09:23:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 09:23:23.626397 543705 disk_info.go:125] begin check local disk info of client
I0320 09:23:23.628942 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:23:23.628947 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035cec0 0xc00035cf00]
E0320 09:23:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:33.409780 543705 memory.go:184] no items to output this cycle
I0320 09:23:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 09:23:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:43.409817 543705 memory.go:191] Add success.
I0320 09:23:43.409826 543705 cpu.go:282] Add success.
I0320 09:23:43.419853 543705 net.go:648] Add success.
I0320 09:23:43.422649 543705 net.go:770] primary dev: ETH0
I0320 09:23:43.422662 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:23:43.422674 543705 net.go:698] Add success.
I0320 09:23:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:23:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:23:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:23:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:23:53.409781 543705 memory.go:184] no items to output this cycle
I0320 09:23:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 09:24:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:03.409792 543705 memory.go:184] no items to output this cycle
I0320 09:24:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 09:24:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:13.409781 543705 memory.go:191] Add success.
W0320 09:24:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:24:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:24:13.409818 543705 cpu.go:282] Add success.
I0320 09:24:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:24:13.420128 543705 net.go:648] Add success.
I0320 09:24:13.423364 543705 net.go:770] primary dev: ETH0
I0320 09:24:13.423379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:24:13.423394 543705 net.go:698] Add success.
I0320 09:24:13.463557 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2a552381-2908-4d4c-b670-da4c73f521c8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:24:13.463591 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:24:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:24:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:24:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 09:24:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:24:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 09:24:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:24:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:24:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:24:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:24:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:24:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:24:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:23.409769 543705 memory.go:184] no items to output this cycle
I0320 09:24:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 09:24:23.629285 543705 disk_info.go:125] begin check local disk info of client
I0320 09:24:23.631822 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:24:23.631827 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315e00 0xc000315e40]
E0320 09:24:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:33.409778 543705 memory.go:184] no items to output this cycle
I0320 09:24:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 09:24:38.405733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:24:38.405740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:24:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:43.410581 543705 memory.go:191] Add success.
I0320 09:24:43.409825 543705 cpu.go:282] Add success.
I0320 09:24:43.420270 543705 net.go:648] Add success.
I0320 09:24:43.422874 543705 net.go:770] primary dev: ETH0
I0320 09:24:43.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:24:43.422901 543705 net.go:698] Add success.
I0320 09:24:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:24:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:24:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:24:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:24:53.409770 543705 memory.go:184] no items to output this cycle
I0320 09:24:53.409792 543705 cpu.go:275] no items to output this cycle
I0320 09:25:03.409905 543705 cpu.go:275] no items to output this cycle
E0320 09:25:03.409943 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:03.409976 543705 memory.go:184] no items to output this cycle
E0320 09:25:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:13.409822 543705 memory.go:191] Add success.
I0320 09:25:13.409830 543705 cpu.go:282] Add success.
W0320 09:25:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:25:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:25:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:25:13.420094 543705 net.go:648] Add success.
I0320 09:25:13.422593 543705 net.go:770] primary dev: ETH0
I0320 09:25:13.422609 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:25:13.422623 543705 net.go:698] Add success.
I0320 09:25:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:25:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:25:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 09:25:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:25:14.456775 543705 disk_worker.go:494] system disk:vda1
I0320 09:25:14.456804 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:25:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:25:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:25:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:25:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:25:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:25:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:23.409803 543705 memory.go:184] no items to output this cycle
I0320 09:25:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 09:25:23.632298 543705 disk_info.go:125] begin check local disk info of client
I0320 09:25:23.634899 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:25:23.634905 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab000 0xc0001ab040]
E0320 09:25:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:33.409809 543705 memory.go:184] no items to output this cycle
I0320 09:25:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 09:25:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:43.409802 543705 cpu.go:282] Add success.
I0320 09:25:43.409803 543705 memory.go:191] Add success.
I0320 09:25:43.419902 543705 net.go:648] Add success.
I0320 09:25:43.422968 543705 net.go:770] primary dev: ETH0
I0320 09:25:43.422981 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:25:43.422994 543705 net.go:698] Add success.
I0320 09:25:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:25:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:25:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:25:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:25:53.409785 543705 memory.go:184] no items to output this cycle
I0320 09:25:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 09:26:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:03.409778 543705 memory.go:184] no items to output this cycle
I0320 09:26:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 09:26:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:13.409930 543705 memory.go:191] Add success.
I0320 09:26:13.409941 543705 cpu.go:282] Add success.
W0320 09:26:13.409974 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:26:13.409987 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:26:13.409991 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:26:13.419722 543705 net.go:648] Add success.
I0320 09:26:13.422274 543705 net.go:770] primary dev: ETH0
I0320 09:26:13.422287 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:26:13.422298 543705 net.go:698] Add success.
I0320 09:26:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:26:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:26:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 09:26:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:26:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 09:26:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:26:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:26:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:26:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:26:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:26:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:26:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:23.409771 543705 memory.go:184] no items to output this cycle
I0320 09:26:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 09:26:23.635324 543705 disk_info.go:125] begin check local disk info of client
I0320 09:26:23.637854 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:26:23.637859 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c57c0 0xc0000c5800]
E0320 09:26:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:33.409777 543705 memory.go:184] no items to output this cycle
I0320 09:26:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:26:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:43.409790 543705 memory.go:191] Add success.
I0320 09:26:43.409812 543705 cpu.go:282] Add success.
I0320 09:26:43.420058 543705 net.go:648] Add success.
I0320 09:26:43.422645 543705 net.go:770] primary dev: ETH0
I0320 09:26:43.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:26:43.422671 543705 net.go:698] Add success.
I0320 09:26:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:26:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:26:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:26:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:26:53.409785 543705 memory.go:184] no items to output this cycle
I0320 09:26:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 09:27:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:03.409779 543705 memory.go:184] no items to output this cycle
I0320 09:27:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 09:27:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:13.409788 543705 memory.go:191] Add success.
I0320 09:27:13.409791 543705 cpu.go:282] Add success.
W0320 09:27:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:27:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:27:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:27:13.420169 543705 net.go:648] Add success.
I0320 09:27:13.423002 543705 net.go:770] primary dev: ETH0
I0320 09:27:13.423015 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:27:13.423026 543705 net.go:698] Add success.
I0320 09:27:13.429274 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 09:27:13.452783 543705 event_worker.go:152] Polling the log file for events...
I0320 09:27:13.468972 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c3ee894b-4451-48a8-944a-128c0f1c1b23","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:27:13.469003 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 09:27:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:27:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 09:27:14.455198 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:27:14.455842 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:27:14.455851 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:27:14.455856 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:27:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 09:27:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:27:15.457009 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:27:15.457024 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:27:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:27:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:27:16.458002 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:27:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:27:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:27:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:23.409773 543705 memory.go:184] no items to output this cycle
I0320 09:27:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 09:27:23.637936 543705 disk_info.go:125] begin check local disk info of client
I0320 09:27:23.640450 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:27:23.640456 543705 disk_info.go:196] parse disk info done, disk is : [0xc00052e0c0 0xc00052e100]
E0320 09:27:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:33.409778 543705 memory.go:184] no items to output this cycle
I0320 09:27:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 09:27:38.409740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:27:38.409746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:27:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:43.410610 543705 memory.go:191] Add success.
I0320 09:27:43.409793 543705 cpu.go:282] Add success.
I0320 09:27:43.420312 543705 net.go:648] Add success.
I0320 09:27:43.423122 543705 net.go:770] primary dev: ETH0
I0320 09:27:43.423135 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:27:43.423147 543705 net.go:698] Add success.
I0320 09:27:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:27:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:27:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:27:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:27:53.409770 543705 memory.go:184] no items to output this cycle
I0320 09:27:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 09:28:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:03.409779 543705 memory.go:184] no items to output this cycle
I0320 09:28:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 09:28:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:13.409777 543705 memory.go:191] Add success.
W0320 09:28:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 09:28:13.409813 543705 cpu.go:282] Add success.
W0320 09:28:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:28:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:28:13.420440 543705 net.go:648] Add success.
I0320 09:28:13.423103 543705 net.go:770] primary dev: ETH0
I0320 09:28:13.423115 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:28:13.423126 543705 net.go:698] Add success.
I0320 09:28:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:28:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:28:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 09:28:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:28:14.456525 543705 disk_worker.go:494] system disk:vda1
I0320 09:28:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:28:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:28:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:28:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:28:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:28:16.472452 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:28:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:23.409772 543705 memory.go:184] no items to output this cycle
I0320 09:28:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 09:28:23.641337 543705 disk_info.go:125] begin check local disk info of client
I0320 09:28:23.643935 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:28:23.643940 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b7600 0xc0003b7640]
E0320 09:28:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 09:28:33.409793 543705 memory.go:184] no items to output this cycle
E0320 09:28:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:43.409798 543705 memory.go:191] Add success.
I0320 09:28:43.409808 543705 cpu.go:282] Add success.
I0320 09:28:43.419875 543705 net.go:648] Add success.
I0320 09:28:43.422371 543705 net.go:770] primary dev: ETH0
I0320 09:28:43.422386 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:28:43.422400 543705 net.go:698] Add success.
I0320 09:28:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:28:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:28:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:28:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:28:53.409772 543705 memory.go:184] no items to output this cycle
I0320 09:28:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:29:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:03.409768 543705 memory.go:184] no items to output this cycle
I0320 09:29:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 09:29:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:13.409826 543705 memory.go:191] Add success.
I0320 09:29:13.409835 543705 cpu.go:282] Add success.
W0320 09:29:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:29:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:29:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:29:13.420101 543705 net.go:648] Add success.
I0320 09:29:13.422837 543705 net.go:770] primary dev: ETH0
I0320 09:29:13.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:29:13.422867 543705 net.go:698] Add success.
I0320 09:29:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:29:14.455364 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:29:14.455382 543705 disk_worker.go:708] disk space is not compliant
W0320 09:29:14.455398 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:29:14.456973 543705 disk_worker.go:494] system disk:vda1
I0320 09:29:14.457001 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:29:15.456023 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:29:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:29:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:29:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:29:16.472466 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:29:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:23.409800 543705 memory.go:184] no items to output this cycle
I0320 09:29:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 09:29:23.644416 543705 disk_info.go:125] begin check local disk info of client
I0320 09:29:23.646935 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:29:23.646941 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b100 0xc00007b140]
E0320 09:29:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:33.409786 543705 memory.go:184] no items to output this cycle
I0320 09:29:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 09:29:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:43.409791 543705 memory.go:191] Add success.
I0320 09:29:43.409794 543705 cpu.go:282] Add success.
I0320 09:29:43.419887 543705 net.go:648] Add success.
I0320 09:29:43.422473 543705 net.go:770] primary dev: ETH0
I0320 09:29:43.422486 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:29:43.422499 543705 net.go:698] Add success.
I0320 09:29:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:29:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:29:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:29:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:29:53.409802 543705 memory.go:184] no items to output this cycle
I0320 09:29:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 09:30:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:03.409795 543705 memory.go:184] no items to output this cycle
I0320 09:30:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:30:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:13.409788 543705 memory.go:191] Add success.
I0320 09:30:13.409789 543705 cpu.go:282] Add success.
W0320 09:30:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:30:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:30:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:30:13.420129 543705 net.go:648] Add success.
I0320 09:30:13.422650 543705 net.go:770] primary dev: ETH0
I0320 09:30:13.422664 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:30:13.422676 543705 net.go:698] Add success.
I0320 09:30:13.469362 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e99a6da0-96ba-4fe4-8295-e602eeed6c56","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:30:13.469395 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:30:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:30:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:30:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 09:30:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:30:14.456743 543705 disk_worker.go:494] system disk:vda1
I0320 09:30:14.456772 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:30:15.455985 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:30:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:30:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:30:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:30:16.472452 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:30:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:23.409769 543705 memory.go:184] no items to output this cycle
I0320 09:30:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 09:30:23.647370 543705 disk_info.go:125] begin check local disk info of client
I0320 09:30:23.649925 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:30:23.649931 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329000 0xc000329040]
E0320 09:30:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:33.409800 543705 memory.go:184] no items to output this cycle
I0320 09:30:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 09:30:38.410797 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:30:38.410804 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:30:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:43.410579 543705 memory.go:191] Add success.
I0320 09:30:43.409808 543705 cpu.go:282] Add success.
I0320 09:30:43.420351 543705 net.go:648] Add success.
I0320 09:30:43.423522 543705 net.go:770] primary dev: ETH0
I0320 09:30:43.423536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:30:43.423550 543705 net.go:698] Add success.
I0320 09:30:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:30:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:30:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:30:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:30:53.409788 543705 memory.go:184] no items to output this cycle
I0320 09:30:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 09:31:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:03.409767 543705 memory.go:184] no items to output this cycle
I0320 09:31:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 09:31:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:13.409807 543705 memory.go:191] Add success.
I0320 09:31:13.409819 543705 cpu.go:282] Add success.
W0320 09:31:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:31:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:31:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:31:13.420092 543705 net.go:648] Add success.
I0320 09:31:13.422881 543705 net.go:770] primary dev: ETH0
I0320 09:31:13.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:31:13.422906 543705 net.go:698] Add success.
I0320 09:31:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:31:14.455350 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:31:14.455364 543705 disk_worker.go:708] disk space is not compliant
W0320 09:31:14.455367 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:31:14.456720 543705 disk_worker.go:494] system disk:vda1
I0320 09:31:14.456764 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:31:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:31:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:31:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:31:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:31:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:31:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:23.409765 543705 memory.go:184] no items to output this cycle
I0320 09:31:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 09:31:23.650011 543705 disk_info.go:125] begin check local disk info of client
I0320 09:31:23.652604 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:31:23.652609 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b340 0xc00007b380]
E0320 09:31:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:33.409809 543705 memory.go:184] no items to output this cycle
I0320 09:31:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 09:31:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:43.409813 543705 memory.go:191] Add success.
I0320 09:31:43.409821 543705 cpu.go:282] Add success.
I0320 09:31:43.419932 543705 net.go:648] Add success.
I0320 09:31:43.422455 543705 net.go:770] primary dev: ETH0
I0320 09:31:43.422470 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:31:43.422482 543705 net.go:698] Add success.
I0320 09:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:31:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:31:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:31:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:31:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 09:31:53.409785 543705 memory.go:184] no items to output this cycle
E0320 09:32:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:03.409795 543705 memory.go:184] no items to output this cycle
I0320 09:32:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 09:32:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:13.409774 543705 memory.go:191] Add success.
W0320 09:32:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 09:32:13.409806 543705 cpu.go:282] Add success.
W0320 09:32:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:32:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:32:13.420031 543705 net.go:648] Add success.
I0320 09:32:13.422814 543705 net.go:770] primary dev: ETH0
I0320 09:32:13.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:32:13.422838 543705 net.go:698] Add success.
W0320 09:32:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:32:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 09:32:14.455175 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:32:14.456627 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:32:14.456635 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:32:14.456641 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:32:14.457406 543705 disk_worker.go:494] system disk:vda1
I0320 09:32:14.457435 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:32:15.456957 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:32:15.456972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:32:16.458092 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:32:16.458155 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:32:16.458174 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:32:16.458171 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:32:16.472557 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:32:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:23.409782 543705 memory.go:184] no items to output this cycle
I0320 09:32:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 09:32:23.653377 543705 disk_info.go:125] begin check local disk info of client
I0320 09:32:23.655860 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:32:23.655865 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035b4c0 0xc00035b500]
E0320 09:32:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:33.409796 543705 memory.go:184] no items to output this cycle
I0320 09:32:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:32:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:43.409779 543705 memory.go:191] Add success.
I0320 09:32:43.409808 543705 cpu.go:282] Add success.
I0320 09:32:43.419870 543705 net.go:648] Add success.
I0320 09:32:43.422983 543705 net.go:770] primary dev: ETH0
I0320 09:32:43.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:32:43.423009 543705 net.go:698] Add success.
I0320 09:32:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:32:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:32:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:32:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:32:53.409767 543705 memory.go:184] no items to output this cycle
I0320 09:32:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 09:33:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:03.409783 543705 cpu.go:275] no items to output this cycle
I0320 09:33:03.409788 543705 memory.go:184] no items to output this cycle
E0320 09:33:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:13.409795 543705 memory.go:191] Add success.
I0320 09:33:13.409795 543705 cpu.go:282] Add success.
W0320 09:33:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:33:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:33:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:33:13.420177 543705 net.go:648] Add success.
I0320 09:33:13.422912 543705 net.go:770] primary dev: ETH0
I0320 09:33:13.422926 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:33:13.422939 543705 net.go:698] Add success.
I0320 09:33:13.554359 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4c1a86d5-dc33-40ac-8844-0862c1b78cc8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:33:13.554392 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:33:14.453977 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:33:14.454221 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:33:14.454231 543705 disk_worker.go:708] disk space is not compliant
W0320 09:33:14.454233 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:33:14.455756 543705 disk_worker.go:494] system disk:vda1
I0320 09:33:14.455785 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:33:15.455995 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:33:16.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:33:16.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:33:16.458092 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:33:16.472468 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:33:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:23.409788 543705 memory.go:184] no items to output this cycle
I0320 09:33:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 09:33:23.656387 543705 disk_info.go:125] begin check local disk info of client
I0320 09:33:23.658928 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:33:23.658934 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5980 0xc0000c59c0]
E0320 09:33:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 09:33:33.409804 543705 memory.go:184] no items to output this cycle
I0320 09:33:38.411855 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:33:38.411861 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:33:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:43.410674 543705 memory.go:191] Add success.
I0320 09:33:43.409831 543705 cpu.go:282] Add success.
I0320 09:33:43.420427 543705 net.go:648] Add success.
I0320 09:33:43.423252 543705 net.go:770] primary dev: ETH0
I0320 09:33:43.423265 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:33:43.423278 543705 net.go:698] Add success.
I0320 09:33:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:33:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:33:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:33:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:33:53.409786 543705 memory.go:184] no items to output this cycle
I0320 09:33:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 09:34:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:03.409807 543705 memory.go:184] no items to output this cycle
I0320 09:34:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 09:34:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:13.409785 543705 memory.go:191] Add success.
I0320 09:34:13.409803 543705 cpu.go:282] Add success.
W0320 09:34:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:34:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:34:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:34:13.420115 543705 net.go:648] Add success.
I0320 09:34:13.423127 543705 net.go:770] primary dev: ETH0
I0320 09:34:13.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:34:13.423161 543705 net.go:698] Add success.
I0320 09:34:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:34:14.455309 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:34:14.455319 543705 disk_worker.go:708] disk space is not compliant
W0320 09:34:14.455322 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:34:14.456671 543705 disk_worker.go:494] system disk:vda1
I0320 09:34:14.456713 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:34:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:34:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:34:16.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:34:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:34:16.472536 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:34:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:23.409783 543705 memory.go:184] no items to output this cycle
I0320 09:34:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 09:34:23.659415 543705 disk_info.go:125] begin check local disk info of client
I0320 09:34:23.661987 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:34:23.661992 543705 disk_info.go:196] parse disk info done, disk is : [0xc000493540 0xc000493580]
E0320 09:34:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:33.409773 543705 memory.go:184] no items to output this cycle
I0320 09:34:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 09:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:43.409799 543705 memory.go:191] Add success.
I0320 09:34:43.409801 543705 cpu.go:282] Add success.
I0320 09:34:43.419966 543705 net.go:648] Add success.
I0320 09:34:43.423297 543705 net.go:770] primary dev: ETH0
I0320 09:34:43.423313 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:34:43.423328 543705 net.go:698] Add success.
I0320 09:34:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:34:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:34:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:34:53.409768 543705 memory.go:184] no items to output this cycle
I0320 09:34:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 09:35:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:03.409773 543705 memory.go:184] no items to output this cycle
I0320 09:35:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:35:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:13.409786 543705 memory.go:191] Add success.
I0320 09:35:13.409805 543705 cpu.go:282] Add success.
W0320 09:35:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:35:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:35:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:35:13.420594 543705 net.go:648] Add success.
I0320 09:35:13.423240 543705 net.go:770] primary dev: ETH0
I0320 09:35:13.423253 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:35:13.423265 543705 net.go:698] Add success.
I0320 09:35:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:35:14.455425 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:35:14.455438 543705 disk_worker.go:708] disk space is not compliant
W0320 09:35:14.455442 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:35:14.457030 543705 disk_worker.go:494] system disk:vda1
I0320 09:35:14.457059 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:35:15.454995 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:35:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:35:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:35:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:35:16.472459 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:35:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:23.409788 543705 memory.go:184] no items to output this cycle
I0320 09:35:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 09:35:23.662725 543705 disk_info.go:125] begin check local disk info of client
I0320 09:35:23.665252 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:35:23.665258 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4140 0xc0000c4180]
E0320 09:35:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:33.409781 543705 memory.go:184] no items to output this cycle
I0320 09:35:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 09:35:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:43.409809 543705 memory.go:191] Add success.
I0320 09:35:43.409815 543705 cpu.go:282] Add success.
I0320 09:35:43.419853 543705 net.go:648] Add success.
I0320 09:35:43.422786 543705 net.go:770] primary dev: ETH0
I0320 09:35:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:35:43.422831 543705 net.go:698] Add success.
I0320 09:35:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:35:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:35:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:35:53.410373 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:35:53.410393 543705 memory.go:184] no items to output this cycle
I0320 09:35:53.410405 543705 cpu.go:275] no items to output this cycle
E0320 09:36:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:03.409780 543705 memory.go:184] no items to output this cycle
I0320 09:36:03.409789 543705 cpu.go:275] no items to output this cycle
W0320 09:36:13.409706 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:36:13.409725 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:36:13.409730 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:36:13.409791 543705 cpu.go:282] Add success.
E0320 09:36:13.409814 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:13.409837 543705 memory.go:191] Add success.
I0320 09:36:13.420179 543705 net.go:648] Add success.
I0320 09:36:13.422974 543705 net.go:770] primary dev: ETH0
I0320 09:36:13.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:36:13.423000 543705 net.go:698] Add success.
I0320 09:36:13.469801 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"afbef233-f1ff-45f4-b3bb-5f4fd620bf63","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:36:13.469832 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:36:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:36:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:36:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 09:36:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:36:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 09:36:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:36:15.455650 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:36:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:36:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:36:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:36:16.472495 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:36:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:23.409794 543705 memory.go:184] no items to output this cycle
I0320 09:36:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 09:36:23.665669 543705 disk_info.go:125] begin check local disk info of client
I0320 09:36:23.668240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:36:23.668245 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa680 0xc0001fa6c0]
E0320 09:36:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:33.409799 543705 memory.go:184] no items to output this cycle
I0320 09:36:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 09:36:38.412852 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:36:38.412859 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:36:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:43.410764 543705 memory.go:191] Add success.
I0320 09:36:43.409803 543705 cpu.go:282] Add success.
I0320 09:36:43.420603 543705 net.go:648] Add success.
I0320 09:36:43.423443 543705 net.go:770] primary dev: ETH0
I0320 09:36:43.423456 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:36:43.423470 543705 net.go:698] Add success.
I0320 09:36:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:36:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:36:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:36:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:36:53.409779 543705 cpu.go:275] no items to output this cycle
I0320 09:36:53.409789 543705 memory.go:184] no items to output this cycle
E0320 09:37:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:03.409794 543705 memory.go:184] no items to output this cycle
I0320 09:37:03.409809 543705 cpu.go:275] no items to output this cycle
W0320 09:37:13.409708 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:37:13.409727 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:37:13.409732 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:37:13.409782 543705 cpu.go:282] Add success.
E0320 09:37:13.409827 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:13.409854 543705 memory.go:191] Add success.
I0320 09:37:13.419998 543705 net.go:648] Add success.
I0320 09:37:13.423041 543705 net.go:770] primary dev: ETH0
I0320 09:37:13.423057 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:37:13.423071 543705 net.go:698] Add success.
I0320 09:37:13.453627 543705 event_worker.go:152] Polling the log file for events...
W0320 09:37:14.455283 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:37:14.455361 543705 disk_worker.go:708] disk space is not compliant
W0320 09:37:14.455365 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:37:14.456555 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:37:14.456564 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:37:14.456570 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:37:14.457444 543705 disk_worker.go:494] system disk:vda1
I0320 09:37:14.457486 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:37:15.456779 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:37:15.456789 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:37:16.458099 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:37:16.458144 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:37:16.458169 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:37:16.458191 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:37:16.472583 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:37:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:23.409803 543705 memory.go:184] no items to output this cycle
I0320 09:37:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 09:37:23.668325 543705 disk_info.go:125] begin check local disk info of client
I0320 09:37:23.670848 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:37:23.670854 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6040 0xc0001c60c0]
E0320 09:37:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:33.409780 543705 memory.go:184] no items to output this cycle
I0320 09:37:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 09:37:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:43.409814 543705 memory.go:191] Add success.
I0320 09:37:43.409821 543705 cpu.go:282] Add success.
I0320 09:37:43.419874 543705 net.go:648] Add success.
I0320 09:37:43.422790 543705 net.go:770] primary dev: ETH0
I0320 09:37:43.422803 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:37:43.422817 543705 net.go:698] Add success.
I0320 09:37:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:37:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:37:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:37:53.410230 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:37:53.410256 543705 memory.go:184] no items to output this cycle
I0320 09:37:53.410284 543705 cpu.go:275] no items to output this cycle
E0320 09:38:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:03.409792 543705 memory.go:184] no items to output this cycle
I0320 09:38:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 09:38:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:13.409820 543705 memory.go:191] Add success.
I0320 09:38:13.409823 543705 cpu.go:282] Add success.
W0320 09:38:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:38:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:38:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:38:13.420164 543705 net.go:648] Add success.
I0320 09:38:13.422986 543705 net.go:770] primary dev: ETH0
I0320 09:38:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:38:13.423017 543705 net.go:698] Add success.
I0320 09:38:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:38:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:38:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 09:38:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:38:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 09:38:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:38:15.456016 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:38:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:38:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:38:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:38:16.472549 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:38:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:23.409799 543705 memory.go:184] no items to output this cycle
I0320 09:38:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 09:38:23.671491 543705 disk_info.go:125] begin check local disk info of client
I0320 09:38:23.674040 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:38:23.674046 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b3680 0xc0002b36c0]
I0320 09:38:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 09:38:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:33.409816 543705 memory.go:184] no items to output this cycle
E0320 09:38:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:43.409787 543705 memory.go:191] Add success.
I0320 09:38:43.409811 543705 cpu.go:282] Add success.
I0320 09:38:43.419839 543705 net.go:648] Add success.
I0320 09:38:43.422644 543705 net.go:770] primary dev: ETH0
I0320 09:38:43.422664 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:38:43.422682 543705 net.go:698] Add success.
I0320 09:38:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:38:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:38:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:38:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:38:53.409797 543705 memory.go:184] no items to output this cycle
I0320 09:38:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 09:39:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:03.409782 543705 memory.go:184] no items to output this cycle
I0320 09:39:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 09:39:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:13.409781 543705 memory.go:191] Add success.
W0320 09:39:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 09:39:13.409813 543705 cpu.go:282] Add success.
W0320 09:39:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:39:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:39:13.420142 543705 net.go:648] Add success.
I0320 09:39:13.423078 543705 net.go:770] primary dev: ETH0
I0320 09:39:13.423091 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:39:13.423103 543705 net.go:698] Add success.
I0320 09:39:13.487646 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e0b149c8-470b-458f-9f31-09760360f7eb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:39:13.487681 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:39:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:39:14.455088 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:39:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 09:39:14.455152 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:39:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 09:39:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:39:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:39:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:39:16.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:39:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:39:16.472462 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:39:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 09:39:23.409789 543705 memory.go:184] no items to output this cycle
I0320 09:39:23.674728 543705 disk_info.go:125] begin check local disk info of client
I0320 09:39:23.677297 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:39:23.677303 543705 disk_info.go:196] parse disk info done, disk is : [0xc000299380 0xc0002993c0]
E0320 09:39:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:33.409783 543705 memory.go:184] no items to output this cycle
I0320 09:39:33.409815 543705 cpu.go:275] no items to output this cycle
I0320 09:39:38.413858 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:39:38.413865 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:39:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:43.410626 543705 memory.go:191] Add success.
I0320 09:39:43.409806 543705 cpu.go:282] Add success.
I0320 09:39:43.420411 543705 net.go:648] Add success.
I0320 09:39:43.423199 543705 net.go:770] primary dev: ETH0
I0320 09:39:43.423215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:39:43.423231 543705 net.go:698] Add success.
I0320 09:39:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:39:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:39:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:39:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:39:53.409811 543705 memory.go:184] no items to output this cycle
I0320 09:39:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 09:40:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:03.409772 543705 memory.go:184] no items to output this cycle
I0320 09:40:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 09:40:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:13.409825 543705 memory.go:191] Add success.
I0320 09:40:13.409833 543705 cpu.go:282] Add success.
W0320 09:40:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:40:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:40:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:40:13.420128 543705 net.go:648] Add success.
I0320 09:40:13.422852 543705 net.go:770] primary dev: ETH0
I0320 09:40:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:40:13.422878 543705 net.go:698] Add success.
I0320 09:40:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:40:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:40:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 09:40:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:40:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 09:40:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:40:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:40:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:40:16.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:40:16.458103 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:40:16.472492 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:40:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:23.409790 543705 memory.go:184] no items to output this cycle
I0320 09:40:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 09:40:23.677672 543705 disk_info.go:125] begin check local disk info of client
I0320 09:40:23.680218 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:40:23.680223 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027b6c0 0xc00027b700]
E0320 09:40:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:33.409803 543705 memory.go:184] no items to output this cycle
I0320 09:40:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 09:40:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:43.409807 543705 memory.go:191] Add success.
I0320 09:40:43.409810 543705 cpu.go:282] Add success.
I0320 09:40:43.419996 543705 net.go:648] Add success.
I0320 09:40:43.423325 543705 net.go:770] primary dev: ETH0
I0320 09:40:43.423339 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:40:43.423354 543705 net.go:698] Add success.
I0320 09:40:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:40:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:40:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:40:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:40:53.409809 543705 memory.go:184] no items to output this cycle
I0320 09:40:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 09:41:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:03.409818 543705 memory.go:184] no items to output this cycle
I0320 09:41:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 09:41:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:13.409818 543705 memory.go:191] Add success.
I0320 09:41:13.409823 543705 cpu.go:282] Add success.
W0320 09:41:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:41:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:41:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:41:13.420137 543705 net.go:648] Add success.
I0320 09:41:13.422658 543705 net.go:770] primary dev: ETH0
I0320 09:41:13.422676 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:41:13.422691 543705 net.go:698] Add success.
I0320 09:41:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:41:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:41:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 09:41:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:41:14.456493 543705 disk_worker.go:494] system disk:vda1
I0320 09:41:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:41:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:41:16.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:41:16.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:41:16.458097 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:41:16.472525 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:41:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:23.409776 543705 memory.go:184] no items to output this cycle
I0320 09:41:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 09:41:23.681551 543705 disk_info.go:125] begin check local disk info of client
I0320 09:41:23.684110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:41:23.684115 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c55c0 0xc0000c5600]
E0320 09:41:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:33.409819 543705 memory.go:184] no items to output this cycle
I0320 09:41:33.409836 543705 cpu.go:275] no items to output this cycle
E0320 09:41:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:43.409798 543705 cpu.go:282] Add success.
I0320 09:41:43.409803 543705 memory.go:191] Add success.
I0320 09:41:43.419970 543705 net.go:648] Add success.
I0320 09:41:43.422733 543705 net.go:770] primary dev: ETH0
I0320 09:41:43.422746 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:41:43.422758 543705 net.go:698] Add success.
I0320 09:41:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:41:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:41:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:41:53.409784 543705 memory.go:184] no items to output this cycle
I0320 09:41:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:42:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:03.409779 543705 memory.go:184] no items to output this cycle
I0320 09:42:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:42:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:13.409806 543705 memory.go:191] Add success.
I0320 09:42:13.409815 543705 cpu.go:282] Add success.
W0320 09:42:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:42:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:42:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:42:13.420116 543705 net.go:648] Add success.
I0320 09:42:13.422944 543705 net.go:770] primary dev: ETH0
I0320 09:42:13.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:42:13.422976 543705 net.go:698] Add success.
I0320 09:42:13.520710 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"80bd7b86-16d8-4bfe-bb9a-766b8c1730d4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:42:13.520742 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 09:42:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:42:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 09:42:14.455161 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:42:14.456154 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:42:14.456164 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:42:14.456170 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:42:14.456442 543705 disk_worker.go:494] system disk:vda1
I0320 09:42:14.456497 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:42:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:42:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:42:16.458090 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:42:16.458149 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:42:16.458167 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:42:16.458189 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:42:16.472564 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:42:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:23.409769 543705 memory.go:184] no items to output this cycle
I0320 09:42:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 09:42:23.684584 543705 disk_info.go:125] begin check local disk info of client
I0320 09:42:23.687052 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:42:23.687058 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003cfa00 0xc0003cfa40]
E0320 09:42:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:33.409771 543705 memory.go:184] no items to output this cycle
I0320 09:42:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 09:42:38.414863 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:42:38.414871 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:42:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:43.410819 543705 memory.go:191] Add success.
I0320 09:42:43.409827 543705 cpu.go:282] Add success.
I0320 09:42:43.420565 543705 net.go:648] Add success.
I0320 09:42:43.423541 543705 net.go:770] primary dev: ETH0
I0320 09:42:43.423556 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:42:43.423570 543705 net.go:698] Add success.
I0320 09:42:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:42:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:42:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:42:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:42:53.409765 543705 memory.go:184] no items to output this cycle
I0320 09:42:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 09:43:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:03.409800 543705 memory.go:184] no items to output this cycle
I0320 09:43:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 09:43:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:13.409782 543705 memory.go:191] Add success.
I0320 09:43:13.409803 543705 cpu.go:282] Add success.
W0320 09:43:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:43:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:43:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:43:13.420146 543705 net.go:648] Add success.
I0320 09:43:13.423036 543705 net.go:770] primary dev: ETH0
I0320 09:43:13.423052 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:43:13.423067 543705 net.go:698] Add success.
I0320 09:43:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:43:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:43:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 09:43:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:43:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 09:43:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:43:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:43:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:43:16.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:43:16.458103 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:43:16.472483 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:43:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:23.409771 543705 memory.go:184] no items to output this cycle
I0320 09:43:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 09:43:23.687588 543705 disk_info.go:125] begin check local disk info of client
I0320 09:43:23.690166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:43:23.690171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004933c0 0xc000493400]
E0320 09:43:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:33.409811 543705 memory.go:184] no items to output this cycle
I0320 09:43:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 09:43:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:43.409791 543705 memory.go:191] Add success.
I0320 09:43:43.409810 543705 cpu.go:282] Add success.
I0320 09:43:43.419972 543705 net.go:648] Add success.
I0320 09:43:43.422894 543705 net.go:770] primary dev: ETH0
I0320 09:43:43.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:43:43.422926 543705 net.go:698] Add success.
I0320 09:43:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:43:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:43:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:43:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:43:53.409782 543705 memory.go:184] no items to output this cycle
I0320 09:43:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:44:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:03.409777 543705 memory.go:184] no items to output this cycle
I0320 09:44:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 09:44:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:13.409777 543705 memory.go:191] Add success.
W0320 09:44:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 09:44:13.409802 543705 cpu.go:282] Add success.
W0320 09:44:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:44:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:44:13.420187 543705 net.go:648] Add success.
I0320 09:44:13.422782 543705 net.go:770] primary dev: ETH0
I0320 09:44:13.422797 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:44:13.422808 543705 net.go:698] Add success.
I0320 09:44:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:44:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:44:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 09:44:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:44:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 09:44:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:44:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:44:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:44:16.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:44:16.458102 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:44:16.472469 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:44:23.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:23.409885 543705 cpu.go:275] no items to output this cycle
I0320 09:44:23.409896 543705 memory.go:184] no items to output this cycle
I0320 09:44:23.691035 543705 disk_info.go:125] begin check local disk info of client
I0320 09:44:23.693564 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:44:23.693570 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c14c0 0xc0003c1500]
E0320 09:44:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:33.409796 543705 memory.go:184] no items to output this cycle
I0320 09:44:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 09:44:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:43.409825 543705 memory.go:191] Add success.
I0320 09:44:43.409835 543705 cpu.go:282] Add success.
I0320 09:44:43.420022 543705 net.go:648] Add success.
I0320 09:44:43.422673 543705 net.go:770] primary dev: ETH0
I0320 09:44:43.422686 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:44:43.422699 543705 net.go:698] Add success.
I0320 09:44:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:44:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:44:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:44:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:44:53.409794 543705 memory.go:184] no items to output this cycle
I0320 09:44:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 09:45:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:03.409764 543705 memory.go:184] no items to output this cycle
I0320 09:45:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 09:45:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:13.409816 543705 memory.go:191] Add success.
I0320 09:45:13.409820 543705 cpu.go:282] Add success.
W0320 09:45:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:45:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:45:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:45:13.420135 543705 net.go:648] Add success.
I0320 09:45:13.422892 543705 net.go:770] primary dev: ETH0
I0320 09:45:13.422907 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:45:13.422920 543705 net.go:698] Add success.
I0320 09:45:13.469164 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"94dec404-846f-4705-9345-00a41e4242be","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:45:13.469195 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:45:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:45:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:45:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 09:45:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:45:14.456504 543705 disk_worker.go:494] system disk:vda1
I0320 09:45:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:45:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:45:16.458011 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:45:16.458084 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:45:16.458113 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:45:16.472564 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:45:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:23.409798 543705 memory.go:184] no items to output this cycle
I0320 09:45:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 09:45:23.693663 543705 disk_info.go:125] begin check local disk info of client
I0320 09:45:23.696225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:45:23.696231 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049f740 0xc00049f780]
E0320 09:45:33.409940 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:33.410027 543705 memory.go:184] no items to output this cycle
I0320 09:45:33.410029 543705 cpu.go:275] no items to output this cycle
I0320 09:45:38.415856 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:45:38.415864 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:45:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:43.410630 543705 memory.go:191] Add success.
I0320 09:45:43.409810 543705 cpu.go:282] Add success.
I0320 09:45:43.420408 543705 net.go:648] Add success.
I0320 09:45:43.423037 543705 net.go:770] primary dev: ETH0
I0320 09:45:43.423049 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:45:43.423062 543705 net.go:698] Add success.
I0320 09:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:45:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:45:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:45:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:45:53.409798 543705 memory.go:184] no items to output this cycle
I0320 09:45:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 09:46:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:03.409776 543705 memory.go:184] no items to output this cycle
I0320 09:46:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 09:46:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:13.409811 543705 memory.go:191] Add success.
I0320 09:46:13.409819 543705 cpu.go:282] Add success.
W0320 09:46:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:46:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:46:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:46:13.420266 543705 net.go:648] Add success.
I0320 09:46:13.423111 543705 net.go:770] primary dev: ETH0
I0320 09:46:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:46:13.423136 543705 net.go:698] Add success.
I0320 09:46:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:46:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:46:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 09:46:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:46:14.456584 543705 disk_worker.go:494] system disk:vda1
I0320 09:46:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:46:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:46:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:46:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:46:16.458088 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:46:16.472528 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:46:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:23.409780 543705 memory.go:184] no items to output this cycle
I0320 09:46:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 09:46:23.697599 543705 disk_info.go:125] begin check local disk info of client
I0320 09:46:23.700170 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:46:23.700176 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046bb40 0xc00046bb80]
E0320 09:46:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:33.409765 543705 memory.go:184] no items to output this cycle
I0320 09:46:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 09:46:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:43.409812 543705 memory.go:191] Add success.
I0320 09:46:43.409814 543705 cpu.go:282] Add success.
I0320 09:46:43.419964 543705 net.go:648] Add success.
I0320 09:46:43.422874 543705 net.go:770] primary dev: ETH0
I0320 09:46:43.422887 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:46:43.422899 543705 net.go:698] Add success.
I0320 09:46:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:46:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:46:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:46:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:46:53.409764 543705 memory.go:184] no items to output this cycle
I0320 09:46:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 09:47:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:03.409765 543705 memory.go:184] no items to output this cycle
I0320 09:47:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 09:47:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:13.409783 543705 memory.go:191] Add success.
I0320 09:47:13.409805 543705 cpu.go:282] Add success.
W0320 09:47:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:47:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:47:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:47:13.420173 543705 net.go:648] Add success.
I0320 09:47:13.422869 543705 net.go:770] primary dev: ETH0
I0320 09:47:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:47:13.422894 543705 net.go:698] Add success.
I0320 09:47:13.453440 543705 event_worker.go:152] Polling the log file for events...
W0320 09:47:14.455093 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:47:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 09:47:14.455156 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:47:14.456881 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:47:14.456889 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:47:14.456895 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:47:14.456964 543705 disk_worker.go:494] system disk:vda1
I0320 09:47:14.457007 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:47:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:47:15.456842 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:47:16.458097 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:47:16.458173 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
E0320 09:47:16.458170 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:47:16.458193 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:47:16.472592 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:47:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:23.409787 543705 memory.go:184] no items to output this cycle
I0320 09:47:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 09:47:23.700654 543705 disk_info.go:125] begin check local disk info of client
I0320 09:47:23.703176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:47:23.703182 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c3280 0xc0004c32c0]
E0320 09:47:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:33.409814 543705 memory.go:184] no items to output this cycle
I0320 09:47:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 09:47:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:43.409790 543705 memory.go:191] Add success.
I0320 09:47:43.409808 543705 cpu.go:282] Add success.
I0320 09:47:43.419973 543705 net.go:648] Add success.
I0320 09:47:43.423300 543705 net.go:770] primary dev: ETH0
I0320 09:47:43.423313 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:47:43.423325 543705 net.go:698] Add success.
I0320 09:47:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:47:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:47:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:47:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:47:53.409799 543705 memory.go:184] no items to output this cycle
I0320 09:47:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:48:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:03.409777 543705 memory.go:184] no items to output this cycle
I0320 09:48:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 09:48:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:13.409787 543705 memory.go:191] Add success.
I0320 09:48:13.409786 543705 cpu.go:282] Add success.
W0320 09:48:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:48:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:48:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:48:13.420107 543705 net.go:648] Add success.
I0320 09:48:13.423251 543705 net.go:770] primary dev: ETH0
I0320 09:48:13.423263 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:48:13.423275 543705 net.go:698] Add success.
I0320 09:48:13.469748 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dcd07bab-512d-427f-a97e-93b195b3e2fe","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:48:13.469782 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:48:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:48:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:48:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 09:48:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:48:14.456782 543705 disk_worker.go:494] system disk:vda1
I0320 09:48:14.456813 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:48:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:48:16.458005 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:48:16.458082 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:48:16.458116 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:48:16.472564 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:48:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:23.409771 543705 memory.go:184] no items to output this cycle
I0320 09:48:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 09:48:23.703649 543705 disk_info.go:125] begin check local disk info of client
I0320 09:48:23.706244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:48:23.706250 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8140 0xc0004a8180]
E0320 09:48:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:33.409771 543705 memory.go:184] no items to output this cycle
I0320 09:48:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 09:48:38.416861 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:48:38.416868 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:48:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:43.410600 543705 memory.go:191] Add success.
I0320 09:48:43.409808 543705 cpu.go:282] Add success.
I0320 09:48:43.420385 543705 net.go:648] Add success.
I0320 09:48:43.423092 543705 net.go:770] primary dev: ETH0
I0320 09:48:43.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:48:43.423122 543705 net.go:698] Add success.
I0320 09:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:48:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:48:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:48:53.409777 543705 memory.go:184] no items to output this cycle
I0320 09:48:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 09:49:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:03.409775 543705 memory.go:184] no items to output this cycle
I0320 09:49:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 09:49:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:13.409810 543705 memory.go:191] Add success.
I0320 09:49:13.409821 543705 cpu.go:282] Add success.
W0320 09:49:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:49:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:49:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:49:13.420306 543705 net.go:648] Add success.
I0320 09:49:13.422920 543705 net.go:770] primary dev: ETH0
I0320 09:49:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:49:13.422944 543705 net.go:698] Add success.
I0320 09:49:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:49:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:49:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 09:49:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:49:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 09:49:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:49:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:49:16.458026 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:49:16.458108 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:49:16.458146 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:49:16.472612 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:49:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:23.409780 543705 memory.go:184] no items to output this cycle
I0320 09:49:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 09:49:23.707683 543705 disk_info.go:125] begin check local disk info of client
I0320 09:49:23.710198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:49:23.710204 543705 disk_info.go:196] parse disk info done, disk is : [0xc000375ac0 0xc000375b00]
E0320 09:49:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:33.409796 543705 memory.go:184] no items to output this cycle
I0320 09:49:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 09:49:43.409837 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:43.409865 543705 memory.go:191] Add success.
I0320 09:49:43.409970 543705 cpu.go:282] Add success.
I0320 09:49:43.419718 543705 net.go:648] Add success.
I0320 09:49:43.422259 543705 net.go:770] primary dev: ETH0
I0320 09:49:43.422274 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:49:43.422287 543705 net.go:698] Add success.
I0320 09:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:49:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:49:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:49:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:49:53.409780 543705 memory.go:184] no items to output this cycle
I0320 09:49:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 09:50:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:03.409773 543705 memory.go:184] no items to output this cycle
I0320 09:50:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:50:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:13.409792 543705 memory.go:191] Add success.
I0320 09:50:13.409795 543705 cpu.go:282] Add success.
W0320 09:50:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:50:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:50:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:50:13.420226 543705 net.go:648] Add success.
I0320 09:50:13.422877 543705 net.go:770] primary dev: ETH0
I0320 09:50:13.422890 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:50:13.422902 543705 net.go:698] Add success.
I0320 09:50:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:50:14.455092 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:50:14.455151 543705 disk_worker.go:708] disk space is not compliant
W0320 09:50:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:50:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 09:50:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:50:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:50:16.458039 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:50:16.458114 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:50:16.458153 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:50:16.472757 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:50:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:23.409778 543705 memory.go:184] no items to output this cycle
I0320 09:50:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 09:50:23.710726 543705 disk_info.go:125] begin check local disk info of client
I0320 09:50:23.713252 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:50:23.713257 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b86c0 0xc0003b8700]
E0320 09:50:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:33.409787 543705 memory.go:184] no items to output this cycle
I0320 09:50:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 09:50:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:43.409806 543705 memory.go:191] Add success.
I0320 09:50:43.409810 543705 cpu.go:282] Add success.
I0320 09:50:43.419964 543705 net.go:648] Add success.
I0320 09:50:43.422451 543705 net.go:770] primary dev: ETH0
I0320 09:50:43.422464 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:50:43.422476 543705 net.go:698] Add success.
I0320 09:50:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:50:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:50:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:50:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:50:53.409765 543705 memory.go:184] no items to output this cycle
I0320 09:50:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 09:51:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:03.409798 543705 memory.go:184] no items to output this cycle
I0320 09:51:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:51:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:13.409815 543705 memory.go:191] Add success.
I0320 09:51:13.409827 543705 cpu.go:282] Add success.
W0320 09:51:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:51:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:51:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:51:13.420130 543705 net.go:648] Add success.
I0320 09:51:13.422863 543705 net.go:770] primary dev: ETH0
I0320 09:51:13.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:51:13.422887 543705 net.go:698] Add success.
I0320 09:51:13.469736 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e99289a-ce33-4015-8fa1-906ad0561e1f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:51:13.469770 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:51:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:51:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:51:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 09:51:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:51:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 09:51:14.456524 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:51:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:51:16.458035 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:51:16.458117 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:51:16.458153 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:51:16.472621 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:51:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:23.409779 543705 memory.go:184] no items to output this cycle
I0320 09:51:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 09:51:23.713672 543705 disk_info.go:125] begin check local disk info of client
I0320 09:51:23.716190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:51:23.716195 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004aca80 0xc0004acac0]
E0320 09:51:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:33.409795 543705 memory.go:184] no items to output this cycle
I0320 09:51:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 09:51:38.417846 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:51:38.417853 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:51:43.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:43.410770 543705 memory.go:191] Add success.
I0320 09:51:43.409917 543705 cpu.go:282] Add success.
I0320 09:51:43.419736 543705 net.go:648] Add success.
I0320 09:51:43.422432 543705 net.go:770] primary dev: ETH0
I0320 09:51:43.422447 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:51:43.422460 543705 net.go:698] Add success.
I0320 09:51:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:51:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:51:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:51:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:51:53.409777 543705 memory.go:184] no items to output this cycle
I0320 09:51:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 09:52:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:03.409777 543705 memory.go:184] no items to output this cycle
I0320 09:52:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 09:52:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:13.409781 543705 cpu.go:282] Add success.
I0320 09:52:13.409782 543705 memory.go:191] Add success.
W0320 09:52:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:52:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:52:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:52:13.420206 543705 net.go:648] Add success.
I0320 09:52:13.422911 543705 net.go:770] primary dev: ETH0
I0320 09:52:13.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:52:13.422936 543705 net.go:698] Add success.
W0320 09:52:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:52:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 09:52:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:52:14.455888 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:52:14.455897 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:52:14.455903 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:52:14.456628 543705 disk_worker.go:494] system disk:vda1
I0320 09:52:14.456671 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:52:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:52:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:52:16.458103 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:52:16.458150 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:52:16.458181 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:52:16.458205 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:52:16.472634 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:52:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:23.409807 543705 memory.go:184] no items to output this cycle
I0320 09:52:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 09:52:23.716716 543705 disk_info.go:125] begin check local disk info of client
I0320 09:52:23.719251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:52:23.719256 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032e2c0 0xc00032e300]
E0320 09:52:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:33.409809 543705 memory.go:184] no items to output this cycle
I0320 09:52:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 09:52:43.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:43.409888 543705 memory.go:191] Add success.
I0320 09:52:43.409981 543705 cpu.go:282] Add success.
I0320 09:52:43.419748 543705 net.go:648] Add success.
I0320 09:52:43.422488 543705 net.go:770] primary dev: ETH0
I0320 09:52:43.422501 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:52:43.422513 543705 net.go:698] Add success.
I0320 09:52:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:52:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:52:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:52:53.410268 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:52:53.410287 543705 memory.go:184] no items to output this cycle
I0320 09:52:53.410298 543705 cpu.go:275] no items to output this cycle
E0320 09:53:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:03.409771 543705 memory.go:184] no items to output this cycle
I0320 09:53:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 09:53:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:13.409809 543705 memory.go:191] Add success.
I0320 09:53:13.409816 543705 cpu.go:282] Add success.
W0320 09:53:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:53:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:53:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:53:13.420109 543705 net.go:648] Add success.
I0320 09:53:13.422782 543705 net.go:770] primary dev: ETH0
I0320 09:53:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:53:13.422811 543705 net.go:698] Add success.
I0320 09:53:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:53:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:53:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 09:53:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:53:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 09:53:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:53:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:53:16.458015 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:53:16.458090 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:53:16.458122 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:53:16.472525 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:53:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:23.409805 543705 memory.go:184] no items to output this cycle
I0320 09:53:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 09:53:23.720730 543705 disk_info.go:125] begin check local disk info of client
I0320 09:53:23.723262 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:53:23.723267 543705 disk_info.go:196] parse disk info done, disk is : [0xc000353ac0 0xc000353b00]
I0320 09:53:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 09:53:33.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:33.409828 543705 memory.go:184] no items to output this cycle
E0320 09:53:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:43.409797 543705 memory.go:191] Add success.
I0320 09:53:43.409817 543705 cpu.go:282] Add success.
I0320 09:53:43.420024 543705 net.go:648] Add success.
I0320 09:53:43.423103 543705 net.go:770] primary dev: ETH0
I0320 09:53:43.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:53:43.423128 543705 net.go:698] Add success.
I0320 09:53:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:53:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:53:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:53:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:53:53.409782 543705 memory.go:184] no items to output this cycle
I0320 09:53:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 09:54:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:03.409776 543705 memory.go:184] no items to output this cycle
I0320 09:54:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 09:54:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:13.409809 543705 memory.go:191] Add success.
I0320 09:54:13.409820 543705 cpu.go:282] Add success.
W0320 09:54:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:54:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:54:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:54:13.420269 543705 net.go:648] Add success.
I0320 09:54:13.423105 543705 net.go:770] primary dev: ETH0
I0320 09:54:13.423119 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:54:13.423129 543705 net.go:698] Add success.
I0320 09:54:13.469304 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f3235356-630c-470f-ac97-f875c8a6e35f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:54:13.469339 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 09:54:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:54:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:54:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0320 09:54:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:54:14.456633 543705 disk_worker.go:494] system disk:vda1
I0320 09:54:14.456662 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:54:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:54:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:54:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:54:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:54:16.472469 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:54:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:23.409780 543705 memory.go:184] no items to output this cycle
I0320 09:54:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 09:54:23.724760 543705 disk_info.go:125] begin check local disk info of client
I0320 09:54:23.727300 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:54:23.727305 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b200 0xc00007b240]
E0320 09:54:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:33.409781 543705 memory.go:184] no items to output this cycle
I0320 09:54:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 09:54:38.418854 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:54:38.418861 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:54:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:43.410805 543705 memory.go:191] Add success.
I0320 09:54:43.409806 543705 cpu.go:282] Add success.
I0320 09:54:43.420715 543705 net.go:648] Add success.
I0320 09:54:43.423254 543705 net.go:770] primary dev: ETH0
I0320 09:54:43.423268 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:54:43.423280 543705 net.go:698] Add success.
I0320 09:54:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:54:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:54:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:54:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:54:53.409798 543705 memory.go:184] no items to output this cycle
I0320 09:54:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 09:55:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:03.409776 543705 memory.go:184] no items to output this cycle
I0320 09:55:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 09:55:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:13.409788 543705 memory.go:191] Add success.
W0320 09:55:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:55:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:55:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:55:13.409838 543705 cpu.go:282] Add success.
I0320 09:55:13.420142 543705 net.go:648] Add success.
I0320 09:55:13.422479 543705 net.go:770] primary dev: ETH0
I0320 09:55:13.422494 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:55:13.422506 543705 net.go:698] Add success.
I0320 09:55:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:55:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:55:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 09:55:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:55:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 09:55:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:55:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:55:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:55:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:55:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:55:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:55:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:23.409785 543705 memory.go:184] no items to output this cycle
I0320 09:55:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 09:55:23.728773 543705 disk_info.go:125] begin check local disk info of client
I0320 09:55:23.731304 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:55:23.731309 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab500 0xc0001ab540]
I0320 09:55:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 09:55:33.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:33.409833 543705 memory.go:184] no items to output this cycle
E0320 09:55:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:43.409796 543705 memory.go:191] Add success.
I0320 09:55:43.409817 543705 cpu.go:282] Add success.
I0320 09:55:43.419925 543705 net.go:648] Add success.
I0320 09:55:43.422513 543705 net.go:770] primary dev: ETH0
I0320 09:55:43.422527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:55:43.422539 543705 net.go:698] Add success.
I0320 09:55:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:55:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:55:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:55:53.409929 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:55:53.409946 543705 memory.go:184] no items to output this cycle
I0320 09:55:53.410146 543705 cpu.go:275] no items to output this cycle
E0320 09:56:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:03.409804 543705 memory.go:184] no items to output this cycle
I0320 09:56:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 09:56:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:13.409829 543705 memory.go:191] Add success.
I0320 09:56:13.409844 543705 cpu.go:282] Add success.
W0320 09:56:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:56:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:56:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:56:13.420179 543705 net.go:648] Add success.
I0320 09:56:13.423027 543705 net.go:770] primary dev: ETH0
I0320 09:56:13.423041 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:56:13.423055 543705 net.go:698] Add success.
I0320 09:56:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:56:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:56:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 09:56:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:56:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 09:56:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:56:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:56:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:56:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:56:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:56:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:56:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:23.409801 543705 memory.go:184] no items to output this cycle
I0320 09:56:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 09:56:23.732785 543705 disk_info.go:125] begin check local disk info of client
I0320 09:56:23.735327 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:56:23.735332 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007af80 0xc00007afc0]
E0320 09:56:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:33.409809 543705 memory.go:184] no items to output this cycle
I0320 09:56:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 09:56:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:43.409804 543705 memory.go:191] Add success.
I0320 09:56:43.409820 543705 cpu.go:282] Add success.
I0320 09:56:43.419967 543705 net.go:648] Add success.
I0320 09:56:43.422647 543705 net.go:770] primary dev: ETH0
I0320 09:56:43.422662 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:56:43.422676 543705 net.go:698] Add success.
I0320 09:56:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:56:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:56:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:56:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:56:53.409778 543705 memory.go:184] no items to output this cycle
I0320 09:56:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 09:57:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:03.409793 543705 memory.go:184] no items to output this cycle
I0320 09:57:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 09:57:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:13.409813 543705 memory.go:191] Add success.
I0320 09:57:13.409814 543705 cpu.go:282] Add success.
W0320 09:57:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:57:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:57:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:57:13.420309 543705 net.go:648] Add success.
I0320 09:57:13.423018 543705 net.go:770] primary dev: ETH0
I0320 09:57:13.423033 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:57:13.423047 543705 net.go:698] Add success.
I0320 09:57:13.429693 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 09:57:13.452773 543705 event_worker.go:152] Polling the log file for events...
I0320 09:57:13.469359 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"410f77d6-f791-4728-baad-9f2b7b33aa00","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 09:57:13.469392 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 09:57:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:57:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 09:57:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 09:57:14.456693 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 09:57:14.456701 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 09:57:14.456704 543705 custom_config.go:64] query custom config with name: gpu
I0320 09:57:14.456919 543705 disk_worker.go:494] system disk:vda1
I0320 09:57:14.456961 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 09:57:15.456811 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 09:57:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:57:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 09:57:16.457996 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 09:57:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:57:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:57:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:57:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:23.409778 543705 memory.go:184] no items to output this cycle
I0320 09:57:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 09:57:23.735413 543705 disk_info.go:125] begin check local disk info of client
I0320 09:57:23.737863 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:57:23.737868 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329380 0xc0003293c0]
I0320 09:57:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 09:57:33.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:33.409823 543705 memory.go:184] no items to output this cycle
I0320 09:57:38.419859 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 09:57:38.419867 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 09:57:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:43.410694 543705 memory.go:191] Add success.
I0320 09:57:43.409834 543705 cpu.go:282] Add success.
I0320 09:57:43.420409 543705 net.go:648] Add success.
I0320 09:57:43.423244 543705 net.go:770] primary dev: ETH0
I0320 09:57:43.423258 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:57:43.423272 543705 net.go:698] Add success.
I0320 09:57:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:57:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:57:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:57:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:57:53.409773 543705 memory.go:184] no items to output this cycle
I0320 09:57:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 09:58:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:03.409781 543705 memory.go:184] no items to output this cycle
I0320 09:58:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 09:58:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:13.409790 543705 memory.go:191] Add success.
I0320 09:58:13.409801 543705 cpu.go:282] Add success.
W0320 09:58:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:58:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:58:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:58:13.420138 543705 net.go:648] Add success.
I0320 09:58:13.422719 543705 net.go:770] primary dev: ETH0
I0320 09:58:13.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:58:13.422749 543705 net.go:698] Add success.
I0320 09:58:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:58:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:58:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 09:58:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:58:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 09:58:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:58:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:58:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:58:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:58:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:58:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:58:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:23.409807 543705 memory.go:184] no items to output this cycle
I0320 09:58:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 09:58:23.738725 543705 disk_info.go:125] begin check local disk info of client
I0320 09:58:23.741240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:58:23.741246 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d00 0xc0000c4d40]
E0320 09:58:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:33.409772 543705 memory.go:184] no items to output this cycle
I0320 09:58:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 09:58:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:43.409795 543705 memory.go:191] Add success.
I0320 09:58:43.409800 543705 cpu.go:282] Add success.
I0320 09:58:43.419956 543705 net.go:648] Add success.
I0320 09:58:43.422683 543705 net.go:770] primary dev: ETH0
I0320 09:58:43.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:58:43.422708 543705 net.go:698] Add success.
I0320 09:58:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:58:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:58:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:58:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:58:53.409798 543705 memory.go:184] no items to output this cycle
I0320 09:58:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 09:59:03.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:03.409889 543705 memory.go:184] no items to output this cycle
I0320 09:59:03.409966 543705 cpu.go:275] no items to output this cycle
E0320 09:59:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:13.409821 543705 memory.go:191] Add success.
I0320 09:59:13.409832 543705 cpu.go:282] Add success.
W0320 09:59:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 09:59:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 09:59:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 09:59:13.420309 543705 net.go:648] Add success.
I0320 09:59:13.423134 543705 net.go:770] primary dev: ETH0
I0320 09:59:13.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:59:13.423159 543705 net.go:698] Add success.
I0320 09:59:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 09:59:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 09:59:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 09:59:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 09:59:14.456565 543705 disk_worker.go:494] system disk:vda1
I0320 09:59:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 09:59:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 09:59:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:59:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:59:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 09:59:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 09:59:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:23.409809 543705 memory.go:184] no items to output this cycle
I0320 09:59:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 09:59:23.741669 543705 disk_info.go:125] begin check local disk info of client
I0320 09:59:23.744128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 09:59:23.744134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b24c0 0xc0003b2500]
E0320 09:59:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:33.409772 543705 memory.go:184] no items to output this cycle
I0320 09:59:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 09:59:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:43.409804 543705 cpu.go:282] Add success.
I0320 09:59:43.409806 543705 memory.go:191] Add success.
I0320 09:59:43.419889 543705 net.go:648] Add success.
I0320 09:59:43.422585 543705 net.go:770] primary dev: ETH0
I0320 09:59:43.422597 543705 net.go:802] Send network stats successfully!,count is 6
I0320 09:59:43.422609 543705 net.go:698] Add success.
I0320 09:59:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 09:59:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 09:59:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 09:59:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 09:59:53.409771 543705 memory.go:184] no items to output this cycle
I0320 09:59:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 10:00:03.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:03.409907 543705 memory.go:184] no items to output this cycle
I0320 10:00:03.409923 543705 cpu.go:275] no items to output this cycle
E0320 10:00:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:13.409818 543705 memory.go:191] Add success.
I0320 10:00:13.409827 543705 cpu.go:282] Add success.
W0320 10:00:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:00:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:00:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:00:13.420192 543705 net.go:648] Add success.
I0320 10:00:13.422963 543705 net.go:770] primary dev: ETH0
I0320 10:00:13.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:00:13.422990 543705 net.go:698] Add success.
I0320 10:00:13.464537 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1ff8fc91-c517-4e52-9297-653d43b130c3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:00:13.464572 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:00:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:00:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:00:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 10:00:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:00:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 10:00:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:00:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:00:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:00:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:00:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:00:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 10:00:23.409791 543705 memory.go:184] no items to output this cycle
I0320 10:00:23.744214 543705 disk_info.go:125] begin check local disk info of client
I0320 10:00:23.746697 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:00:23.746703 543705 disk_info.go:196] parse disk info done, disk is : [0xc000258ac0 0xc000258b00]
E0320 10:00:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:33.409774 543705 memory.go:184] no items to output this cycle
I0320 10:00:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 10:00:38.420861 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:00:38.420868 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:00:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:43.410663 543705 memory.go:191] Add success.
I0320 10:00:43.409800 543705 cpu.go:282] Add success.
I0320 10:00:43.420350 543705 net.go:648] Add success.
I0320 10:00:43.423210 543705 net.go:770] primary dev: ETH0
I0320 10:00:43.423222 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:00:43.423236 543705 net.go:698] Add success.
I0320 10:00:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:00:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:00:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:00:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:00:53.409770 543705 memory.go:184] no items to output this cycle
I0320 10:00:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 10:01:03.409902 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:03.409923 543705 memory.go:184] no items to output this cycle
I0320 10:01:03.409961 543705 cpu.go:275] no items to output this cycle
E0320 10:01:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:13.409817 543705 memory.go:191] Add success.
I0320 10:01:13.409822 543705 cpu.go:282] Add success.
W0320 10:01:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:01:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:01:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:01:13.420221 543705 net.go:648] Add success.
I0320 10:01:13.423231 543705 net.go:770] primary dev: ETH0
I0320 10:01:13.423244 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:01:13.423256 543705 net.go:698] Add success.
I0320 10:01:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:01:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:01:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 10:01:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:01:14.456571 543705 disk_worker.go:494] system disk:vda1
I0320 10:01:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:01:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:01:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:01:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:01:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:01:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:01:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:23.409813 543705 memory.go:184] no items to output this cycle
I0320 10:01:23.409820 543705 cpu.go:275] no items to output this cycle
I0320 10:01:23.747874 543705 disk_info.go:125] begin check local disk info of client
I0320 10:01:23.750449 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:01:23.750454 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4e00 0xc0000c4e40]
E0320 10:01:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:33.409765 543705 memory.go:184] no items to output this cycle
I0320 10:01:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 10:01:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:43.409802 543705 memory.go:191] Add success.
I0320 10:01:43.409806 543705 cpu.go:282] Add success.
I0320 10:01:43.420060 543705 net.go:648] Add success.
I0320 10:01:43.422477 543705 net.go:770] primary dev: ETH0
I0320 10:01:43.422489 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:01:43.422501 543705 net.go:698] Add success.
I0320 10:01:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:01:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:01:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:01:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:01:53.409766 543705 memory.go:184] no items to output this cycle
I0320 10:01:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 10:02:03.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:03.409944 543705 cpu.go:275] no items to output this cycle
I0320 10:02:03.410011 543705 memory.go:184] no items to output this cycle
E0320 10:02:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:13.409800 543705 memory.go:191] Add success.
I0320 10:02:13.409807 543705 cpu.go:282] Add success.
W0320 10:02:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:02:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:02:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:02:13.420131 543705 net.go:648] Add success.
I0320 10:02:13.423009 543705 net.go:770] primary dev: ETH0
I0320 10:02:13.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:02:13.423033 543705 net.go:698] Add success.
W0320 10:02:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:02:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 10:02:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:02:14.456945 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:02:14.456955 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:02:14.456962 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:02:14.457006 543705 disk_worker.go:494] system disk:vda1
I0320 10:02:14.457032 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:02:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:02:15.456866 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:02:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:02:16.457928 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:02:16.457984 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:02:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:02:16.472332 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:02:23.409813 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:23.409835 543705 memory.go:184] no items to output this cycle
I0320 10:02:23.409844 543705 cpu.go:275] no items to output this cycle
I0320 10:02:23.751923 543705 disk_info.go:125] begin check local disk info of client
I0320 10:02:23.754456 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:02:23.754461 543705 disk_info.go:196] parse disk info done, disk is : [0xc000546cc0 0xc000546d00]
E0320 10:02:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:33.409775 543705 memory.go:184] no items to output this cycle
I0320 10:02:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:02:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:43.409791 543705 memory.go:191] Add success.
I0320 10:02:43.409806 543705 cpu.go:282] Add success.
I0320 10:02:43.420027 543705 net.go:648] Add success.
I0320 10:02:43.422861 543705 net.go:770] primary dev: ETH0
I0320 10:02:43.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:02:43.422889 543705 net.go:698] Add success.
I0320 10:02:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:02:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:02:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:02:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:02:53.409784 543705 memory.go:184] no items to output this cycle
I0320 10:02:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:03:03.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:03.409920 543705 memory.go:184] no items to output this cycle
I0320 10:03:03.409950 543705 cpu.go:275] no items to output this cycle
E0320 10:03:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:13.409783 543705 memory.go:191] Add success.
W0320 10:03:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:03:13.409814 543705 cpu.go:282] Add success.
W0320 10:03:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:03:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:03:13.420094 543705 net.go:648] Add success.
I0320 10:03:13.422871 543705 net.go:770] primary dev: ETH0
I0320 10:03:13.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:03:13.422896 543705 net.go:698] Add success.
I0320 10:03:13.468942 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7bb75d9c-f47f-49ab-963e-4888e0b973a5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:03:13.468983 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:03:14.454949 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:03:14.455092 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:03:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 10:03:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:03:14.456531 543705 disk_worker.go:494] system disk:vda1
I0320 10:03:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:03:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:03:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:03:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:03:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:03:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:03:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:23.409815 543705 memory.go:184] no items to output this cycle
I0320 10:03:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 10:03:23.754541 543705 disk_info.go:125] begin check local disk info of client
I0320 10:03:23.757027 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:03:23.757033 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005464c0 0xc000546500]
E0320 10:03:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:33.409770 543705 memory.go:184] no items to output this cycle
I0320 10:03:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 10:03:38.421872 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:03:38.421880 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:03:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:43.410642 543705 memory.go:191] Add success.
I0320 10:03:43.409824 543705 cpu.go:282] Add success.
I0320 10:03:43.420434 543705 net.go:648] Add success.
I0320 10:03:43.422861 543705 net.go:770] primary dev: ETH0
I0320 10:03:43.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:03:43.422889 543705 net.go:698] Add success.
I0320 10:03:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:03:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:03:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:03:53.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:03:53.409888 543705 memory.go:184] no items to output this cycle
I0320 10:03:53.409923 543705 cpu.go:275] no items to output this cycle
E0320 10:04:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:03.409776 543705 memory.go:184] no items to output this cycle
I0320 10:04:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:04:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:13.409791 543705 memory.go:191] Add success.
I0320 10:04:13.409791 543705 cpu.go:282] Add success.
W0320 10:04:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:04:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:04:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:04:13.420172 543705 net.go:648] Add success.
I0320 10:04:13.423123 543705 net.go:770] primary dev: ETH0
I0320 10:04:13.423137 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:04:13.423148 543705 net.go:698] Add success.
I0320 10:04:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:04:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:04:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 10:04:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:04:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 10:04:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:04:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:04:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:04:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:04:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:04:23.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:23.409820 543705 memory.go:184] no items to output this cycle
I0320 10:04:23.409829 543705 cpu.go:275] no items to output this cycle
I0320 10:04:23.757671 543705 disk_info.go:125] begin check local disk info of client
I0320 10:04:23.760206 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:04:23.760212 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003dc440 0xc0003dc480]
E0320 10:04:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:33.409805 543705 memory.go:184] no items to output this cycle
I0320 10:04:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 10:04:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:43.409792 543705 memory.go:191] Add success.
I0320 10:04:43.409795 543705 cpu.go:282] Add success.
I0320 10:04:43.419873 543705 net.go:648] Add success.
I0320 10:04:43.422555 543705 net.go:770] primary dev: ETH0
I0320 10:04:43.422570 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:04:43.422584 543705 net.go:698] Add success.
I0320 10:04:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:04:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:04:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:04:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:04:53.409779 543705 memory.go:184] no items to output this cycle
I0320 10:04:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:05:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:03.409804 543705 memory.go:184] no items to output this cycle
I0320 10:05:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 10:05:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:13.409784 543705 memory.go:191] Add success.
I0320 10:05:13.409791 543705 cpu.go:282] Add success.
W0320 10:05:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:05:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:05:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:05:13.420061 543705 net.go:648] Add success.
I0320 10:05:13.422688 543705 net.go:770] primary dev: ETH0
I0320 10:05:13.422704 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:05:13.422719 543705 net.go:698] Add success.
I0320 10:05:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:05:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:05:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 10:05:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:05:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 10:05:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:05:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:05:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:05:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:05:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:05:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:05:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 10:05:23.409788 543705 memory.go:184] no items to output this cycle
I0320 10:05:23.761671 543705 disk_info.go:125] begin check local disk info of client
I0320 10:05:23.764171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:05:23.764176 543705 disk_info.go:196] parse disk info done, disk is : [0xc000546580 0xc0005465c0]
E0320 10:05:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:33.409799 543705 memory.go:184] no items to output this cycle
I0320 10:05:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 10:05:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:43.409800 543705 memory.go:191] Add success.
I0320 10:05:43.409801 543705 cpu.go:282] Add success.
I0320 10:05:43.420047 543705 net.go:648] Add success.
I0320 10:05:43.423057 543705 net.go:770] primary dev: ETH0
I0320 10:05:43.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:05:43.423084 543705 net.go:698] Add success.
I0320 10:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:05:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:05:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:05:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:05:53.409767 543705 memory.go:184] no items to output this cycle
I0320 10:05:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 10:06:03.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:03.409882 543705 cpu.go:275] no items to output this cycle
I0320 10:06:03.409910 543705 memory.go:184] no items to output this cycle
E0320 10:06:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:13.409780 543705 memory.go:191] Add success.
W0320 10:06:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:06:13.409813 543705 cpu.go:282] Add success.
W0320 10:06:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:06:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:06:13.420150 543705 net.go:648] Add success.
I0320 10:06:13.422761 543705 net.go:770] primary dev: ETH0
I0320 10:06:13.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:06:13.422790 543705 net.go:698] Add success.
I0320 10:06:13.469655 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fc75721a-12b6-40b3-8392-94e911e8d3c1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:06:13.469688 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:06:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:06:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:06:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 10:06:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:06:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 10:06:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:06:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:06:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:06:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:06:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:06:16.472453 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:06:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:23.409777 543705 memory.go:184] no items to output this cycle
I0320 10:06:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 10:06:23.764948 543705 disk_info.go:125] begin check local disk info of client
I0320 10:06:23.767481 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:06:23.767486 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396bc0 0xc000396c00]
E0320 10:06:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:33.409795 543705 memory.go:184] no items to output this cycle
I0320 10:06:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 10:06:38.422860 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:06:38.422867 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:06:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:43.410759 543705 memory.go:191] Add success.
I0320 10:06:43.409827 543705 cpu.go:282] Add success.
I0320 10:06:43.420524 543705 net.go:648] Add success.
I0320 10:06:43.423234 543705 net.go:770] primary dev: ETH0
I0320 10:06:43.423250 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:06:43.423281 543705 net.go:698] Add success.
I0320 10:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:06:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:06:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:06:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:06:53.409764 543705 memory.go:184] no items to output this cycle
I0320 10:06:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:07:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:03.409896 543705 memory.go:184] no items to output this cycle
I0320 10:07:03.409915 543705 cpu.go:275] no items to output this cycle
E0320 10:07:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:13.409797 543705 memory.go:191] Add success.
I0320 10:07:13.409797 543705 cpu.go:282] Add success.
W0320 10:07:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:07:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:07:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:07:13.420176 543705 net.go:648] Add success.
I0320 10:07:13.423064 543705 net.go:770] primary dev: ETH0
I0320 10:07:13.423076 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:07:13.423087 543705 net.go:698] Add success.
I0320 10:07:13.453661 543705 event_worker.go:152] Polling the log file for events...
W0320 10:07:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:07:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 10:07:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:07:14.456996 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:07:14.457006 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:07:14.457012 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:07:14.457063 543705 disk_worker.go:494] system disk:vda1
I0320 10:07:14.457096 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:07:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:07:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:07:16.457957 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:07:16.457957 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:07:16.458010 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:07:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:07:16.472352 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:07:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:23.409778 543705 memory.go:184] no items to output this cycle
I0320 10:07:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 10:07:23.767566 543705 disk_info.go:125] begin check local disk info of client
I0320 10:07:23.770178 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:07:23.770183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6900 0xc0001c6940]
E0320 10:07:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:33.409794 543705 memory.go:184] no items to output this cycle
I0320 10:07:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 10:07:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:43.409799 543705 memory.go:191] Add success.
I0320 10:07:43.409801 543705 cpu.go:282] Add success.
I0320 10:07:43.419879 543705 net.go:648] Add success.
I0320 10:07:43.422717 543705 net.go:770] primary dev: ETH0
I0320 10:07:43.422733 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:07:43.422745 543705 net.go:698] Add success.
I0320 10:07:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:07:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:07:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:07:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:07:53.409763 543705 memory.go:184] no items to output this cycle
I0320 10:07:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:08:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:03.409763 543705 memory.go:184] no items to output this cycle
I0320 10:08:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 10:08:13.409903 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:13.409946 543705 memory.go:191] Add success.
W0320 10:08:13.409994 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:08:13.410013 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:08:13.410015 543705 cpu.go:282] Add success.
I0320 10:08:13.410020 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:08:13.419713 543705 net.go:648] Add success.
I0320 10:08:13.422263 543705 net.go:770] primary dev: ETH0
I0320 10:08:13.422276 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:08:13.422287 543705 net.go:698] Add success.
I0320 10:08:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:08:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:08:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 10:08:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:08:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 10:08:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:08:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:08:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:08:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:08:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:08:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:08:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:23.409806 543705 memory.go:184] no items to output this cycle
I0320 10:08:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 10:08:23.770264 543705 disk_info.go:125] begin check local disk info of client
I0320 10:08:23.772839 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:08:23.772846 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5200 0xc0000c5240]
E0320 10:08:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:33.409774 543705 memory.go:184] no items to output this cycle
I0320 10:08:33.409779 543705 cpu.go:275] no items to output this cycle
E0320 10:08:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:43.409797 543705 memory.go:191] Add success.
I0320 10:08:43.409797 543705 cpu.go:282] Add success.
I0320 10:08:43.419879 543705 net.go:648] Add success.
I0320 10:08:43.422891 543705 net.go:770] primary dev: ETH0
I0320 10:08:43.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:08:43.422921 543705 net.go:698] Add success.
I0320 10:08:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:08:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:08:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:08:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:08:53.409767 543705 memory.go:184] no items to output this cycle
I0320 10:08:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 10:09:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:03.409801 543705 memory.go:184] no items to output this cycle
I0320 10:09:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 10:09:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:13.409790 543705 memory.go:191] Add success.
I0320 10:09:13.409807 543705 cpu.go:282] Add success.
W0320 10:09:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:09:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:09:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:09:13.419736 543705 net.go:648] Add success.
I0320 10:09:13.422571 543705 net.go:770] primary dev: ETH0
I0320 10:09:13.422584 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:09:13.422595 543705 net.go:698] Add success.
I0320 10:09:13.468397 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3d0f52a5-1aa7-412a-b300-4b79bfb5d7be","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:09:13.468428 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:09:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:09:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:09:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 10:09:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:09:14.456527 543705 disk_worker.go:494] system disk:vda1
I0320 10:09:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:09:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:09:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:09:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:09:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:09:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:09:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:23.409809 543705 memory.go:184] no items to output this cycle
I0320 10:09:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 10:09:23.773672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:09:23.776186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:09:23.776191 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6440 0xc0001c6480]
E0320 10:09:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:33.409779 543705 memory.go:184] no items to output this cycle
I0320 10:09:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 10:09:38.423868 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:09:38.423876 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:09:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:43.410706 543705 memory.go:191] Add success.
I0320 10:09:43.409818 543705 cpu.go:282] Add success.
I0320 10:09:43.420395 543705 net.go:648] Add success.
I0320 10:09:43.423504 543705 net.go:770] primary dev: ETH0
I0320 10:09:43.423518 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:09:43.423545 543705 net.go:698] Add success.
I0320 10:09:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:09:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:09:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:09:53.410274 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:09:53.410294 543705 memory.go:184] no items to output this cycle
I0320 10:09:53.410307 543705 cpu.go:275] no items to output this cycle
E0320 10:10:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:03.409760 543705 memory.go:184] no items to output this cycle
I0320 10:10:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:10:13.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:13.409907 543705 memory.go:191] Add success.
W0320 10:10:13.409939 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:10:13.409949 543705 cpu.go:282] Add success.
W0320 10:10:13.409951 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:10:13.409958 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:10:13.419755 543705 net.go:648] Add success.
I0320 10:10:13.422632 543705 net.go:770] primary dev: ETH0
I0320 10:10:13.422647 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:10:13.422661 543705 net.go:698] Add success.
I0320 10:10:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:10:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:10:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 10:10:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:10:14.456487 543705 disk_worker.go:494] system disk:vda1
I0320 10:10:14.456531 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:10:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:10:16.458029 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:10:16.458089 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:10:16.458111 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:10:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:10:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:23.409790 543705 memory.go:184] no items to output this cycle
I0320 10:10:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 10:10:23.777673 543705 disk_info.go:125] begin check local disk info of client
I0320 10:10:23.780196 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:10:23.780201 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048b7c0 0xc00048b800]
E0320 10:10:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:33.409766 543705 memory.go:184] no items to output this cycle
I0320 10:10:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 10:10:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:43.409811 543705 memory.go:191] Add success.
I0320 10:10:43.409816 543705 cpu.go:282] Add success.
I0320 10:10:43.419707 543705 net.go:770] primary dev: ETH0
I0320 10:10:43.419720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:10:43.419733 543705 net.go:698] Add success.
I0320 10:10:43.419964 543705 net.go:648] Add success.
I0320 10:10:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:10:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:10:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:10:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:10:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:10:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 10:11:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:03.409803 543705 memory.go:184] no items to output this cycle
I0320 10:11:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 10:11:13.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:13.409914 543705 memory.go:191] Add success.
I0320 10:11:13.409917 543705 cpu.go:282] Add success.
W0320 10:11:13.409947 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:11:13.409965 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:11:13.409994 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:11:13.419726 543705 net.go:648] Add success.
I0320 10:11:13.422589 543705 net.go:770] primary dev: ETH0
I0320 10:11:13.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:11:13.422617 543705 net.go:698] Add success.
I0320 10:11:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:11:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:11:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 10:11:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:11:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 10:11:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:11:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:11:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:11:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:11:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:11:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:11:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:23.409817 543705 memory.go:184] no items to output this cycle
I0320 10:11:23.409824 543705 cpu.go:275] no items to output this cycle
I0320 10:11:23.781047 543705 disk_info.go:125] begin check local disk info of client
I0320 10:11:23.783588 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:11:23.783594 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328200 0xc000328240]
E0320 10:11:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:33.409783 543705 memory.go:184] no items to output this cycle
I0320 10:11:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:11:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:43.409826 543705 memory.go:191] Add success.
I0320 10:11:43.409834 543705 cpu.go:282] Add success.
I0320 10:11:43.420002 543705 net.go:648] Add success.
I0320 10:11:43.423116 543705 net.go:770] primary dev: ETH0
I0320 10:11:43.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:11:43.423146 543705 net.go:698] Add success.
I0320 10:11:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:11:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:11:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:11:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:11:53.409773 543705 memory.go:184] no items to output this cycle
I0320 10:11:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:12:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:03.409783 543705 memory.go:184] no items to output this cycle
I0320 10:12:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 10:12:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:13.409804 543705 memory.go:191] Add success.
I0320 10:12:13.409808 543705 cpu.go:282] Add success.
W0320 10:12:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:12:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:12:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:12:13.419717 543705 net.go:648] Add success.
I0320 10:12:13.422967 543705 net.go:770] primary dev: ETH0
I0320 10:12:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:12:13.422990 543705 net.go:698] Add success.
I0320 10:12:13.469656 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"582f2092-2eae-48d9-ac27-da1b71b11a84","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:12:13.469688 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 10:12:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:12:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 10:12:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:12:14.456840 543705 disk_worker.go:494] system disk:vda1
I0320 10:12:14.456879 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:12:14.457095 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:12:14.457102 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:12:14.457106 543705 custom_config.go:64] query custom config with name: gpu
E0320 10:12:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:12:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:12:16.457894 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:12:16.457894 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:12:16.457948 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:12:16.457967 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:12:16.472301 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:12:23.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:23.409823 543705 memory.go:184] no items to output this cycle
I0320 10:12:23.409832 543705 cpu.go:275] no items to output this cycle
I0320 10:12:23.785082 543705 disk_info.go:125] begin check local disk info of client
I0320 10:12:23.787627 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:12:23.787632 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b400 0xc00007b440]
E0320 10:12:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:33.409768 543705 memory.go:184] no items to output this cycle
I0320 10:12:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 10:12:38.424873 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:12:38.424880 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:12:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:43.410644 543705 memory.go:191] Add success.
I0320 10:12:43.409813 543705 cpu.go:282] Add success.
I0320 10:12:43.420324 543705 net.go:648] Add success.
I0320 10:12:43.423098 543705 net.go:770] primary dev: ETH0
I0320 10:12:43.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:12:43.423124 543705 net.go:698] Add success.
I0320 10:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:12:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:12:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:12:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:12:53.409793 543705 memory.go:184] no items to output this cycle
I0320 10:12:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 10:13:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:03.409782 543705 memory.go:184] no items to output this cycle
I0320 10:13:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 10:13:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:13.409778 543705 memory.go:191] Add success.
W0320 10:13:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:13:13.409811 543705 cpu.go:282] Add success.
W0320 10:13:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:13:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:13:13.420292 543705 net.go:648] Add success.
I0320 10:13:13.423233 543705 net.go:770] primary dev: ETH0
I0320 10:13:13.423246 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:13:13.423258 543705 net.go:698] Add success.
I0320 10:13:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:13:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:13:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 10:13:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:13:14.456482 543705 disk_worker.go:494] system disk:vda1
I0320 10:13:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:13:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:13:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:13:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:13:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:13:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:13:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:23.409776 543705 memory.go:184] no items to output this cycle
I0320 10:13:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 10:13:23.789083 543705 disk_info.go:125] begin check local disk info of client
I0320 10:13:23.791623 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:13:23.791629 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c6d80 0xc0001c6dc0]
E0320 10:13:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:33.409768 543705 memory.go:184] no items to output this cycle
I0320 10:13:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 10:13:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:43.409789 543705 memory.go:191] Add success.
I0320 10:13:43.409816 543705 cpu.go:282] Add success.
I0320 10:13:43.419854 543705 net.go:648] Add success.
I0320 10:13:43.422829 543705 net.go:770] primary dev: ETH0
I0320 10:13:43.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:13:43.422861 543705 net.go:698] Add success.
I0320 10:13:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:13:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:13:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:13:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:13:53.409794 543705 memory.go:184] no items to output this cycle
I0320 10:13:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 10:14:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:03.409781 543705 memory.go:184] no items to output this cycle
I0320 10:14:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 10:14:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:13.409799 543705 memory.go:191] Add success.
I0320 10:14:13.409802 543705 cpu.go:282] Add success.
W0320 10:14:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:14:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:14:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:14:13.420071 543705 net.go:648] Add success.
I0320 10:14:13.422856 543705 net.go:770] primary dev: ETH0
I0320 10:14:13.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:14:13.422884 543705 net.go:698] Add success.
I0320 10:14:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:14:14.455432 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:14:14.455443 543705 disk_worker.go:708] disk space is not compliant
W0320 10:14:14.455447 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:14:14.457057 543705 disk_worker.go:494] system disk:vda1
I0320 10:14:14.457086 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:14:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:14:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:14:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:14:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:14:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:14:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:23.409787 543705 memory.go:184] no items to output this cycle
I0320 10:14:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 10:14:23.793068 543705 disk_info.go:125] begin check local disk info of client
I0320 10:14:23.795628 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:14:23.795633 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb4c0 0xc0001fb500]
E0320 10:14:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:33.409770 543705 memory.go:184] no items to output this cycle
I0320 10:14:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 10:14:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:43.409793 543705 memory.go:191] Add success.
I0320 10:14:43.409799 543705 cpu.go:282] Add success.
I0320 10:14:43.419974 543705 net.go:648] Add success.
I0320 10:14:43.422899 543705 net.go:770] primary dev: ETH0
I0320 10:14:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:14:43.422925 543705 net.go:698] Add success.
I0320 10:14:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:14:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:14:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:14:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:14:53.409764 543705 memory.go:184] no items to output this cycle
I0320 10:14:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 10:15:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:03.409776 543705 memory.go:184] no items to output this cycle
I0320 10:15:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 10:15:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:13.409800 543705 memory.go:191] Add success.
I0320 10:15:13.409806 543705 cpu.go:282] Add success.
W0320 10:15:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:15:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:15:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:15:13.420111 543705 net.go:648] Add success.
I0320 10:15:13.422646 543705 net.go:770] primary dev: ETH0
I0320 10:15:13.422660 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:15:13.422672 543705 net.go:698] Add success.
I0320 10:15:13.565525 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1c5a9617-9293-487e-89d8-cd33819fc473","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:15:13.565559 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:15:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:15:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:15:14.455331 543705 disk_worker.go:708] disk space is not compliant
W0320 10:15:14.455336 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:15:14.456955 543705 disk_worker.go:494] system disk:vda1
I0320 10:15:14.456985 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:15:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:15:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:15:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:15:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:15:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:23.409778 543705 memory.go:184] no items to output this cycle
I0320 10:15:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 10:15:23.795715 543705 disk_info.go:125] begin check local disk info of client
I0320 10:15:23.798352 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:15:23.798357 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329440 0xc000329480]
E0320 10:15:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:33.409801 543705 memory.go:184] no items to output this cycle
I0320 10:15:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 10:15:38.425885 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:15:38.425892 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:15:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:43.410935 543705 memory.go:191] Add success.
I0320 10:15:43.409827 543705 cpu.go:282] Add success.
I0320 10:15:43.420704 543705 net.go:648] Add success.
I0320 10:15:43.423376 543705 net.go:770] primary dev: ETH0
I0320 10:15:43.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:15:43.423403 543705 net.go:698] Add success.
I0320 10:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:15:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:15:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:15:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:15:53.409776 543705 memory.go:184] no items to output this cycle
I0320 10:15:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 10:16:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:03.409769 543705 memory.go:184] no items to output this cycle
I0320 10:16:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 10:16:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:13.409823 543705 memory.go:191] Add success.
I0320 10:16:13.409831 543705 cpu.go:282] Add success.
W0320 10:16:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:16:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:16:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:16:13.420167 543705 net.go:648] Add success.
I0320 10:16:13.422786 543705 net.go:770] primary dev: ETH0
I0320 10:16:13.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:16:13.422811 543705 net.go:698] Add success.
I0320 10:16:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:16:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:16:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 10:16:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:16:14.459245 543705 disk_worker.go:494] system disk:vda1
I0320 10:16:14.459273 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:16:15.455948 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:16:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:16:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:16:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:16:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:16:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:23.409789 543705 memory.go:184] no items to output this cycle
I0320 10:16:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 10:16:23.800124 543705 disk_info.go:125] begin check local disk info of client
I0320 10:16:23.802666 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:16:23.802671 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329340 0xc000329380]
E0320 10:16:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:33.409777 543705 memory.go:184] no items to output this cycle
I0320 10:16:33.409777 543705 cpu.go:275] no items to output this cycle
E0320 10:16:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:43.409794 543705 memory.go:191] Add success.
I0320 10:16:43.409795 543705 cpu.go:282] Add success.
I0320 10:16:43.419864 543705 net.go:648] Add success.
I0320 10:16:43.422331 543705 net.go:770] primary dev: ETH0
I0320 10:16:43.422345 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:16:43.422357 543705 net.go:698] Add success.
I0320 10:16:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:16:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:16:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:16:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:16:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:16:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:17:03.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:03.409758 543705 memory.go:184] no items to output this cycle
I0320 10:17:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 10:17:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:13.409819 543705 memory.go:191] Add success.
I0320 10:17:13.409824 543705 cpu.go:282] Add success.
W0320 10:17:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:17:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:17:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:17:13.420056 543705 net.go:648] Add success.
I0320 10:17:13.422784 543705 net.go:770] primary dev: ETH0
I0320 10:17:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:17:13.422809 543705 net.go:698] Add success.
I0320 10:17:13.453339 543705 event_worker.go:152] Polling the log file for events...
W0320 10:17:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:17:14.455253 543705 disk_worker.go:708] disk space is not compliant
W0320 10:17:14.455258 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:17:14.456059 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:17:14.456069 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:17:14.456075 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:17:14.457017 543705 disk_worker.go:494] system disk:vda1
I0320 10:17:14.457127 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:17:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:17:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 10:17:16.457567 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:17:16.457582 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:17:16.457631 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:17:16.457666 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:17:16.472990 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:17:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:23.409807 543705 memory.go:184] no items to output this cycle
I0320 10:17:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 10:17:23.804150 543705 disk_info.go:125] begin check local disk info of client
I0320 10:17:23.806726 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:17:23.806731 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328200 0xc000328240]
E0320 10:17:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:33.409777 543705 memory.go:184] no items to output this cycle
I0320 10:17:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 10:17:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:43.409806 543705 memory.go:191] Add success.
I0320 10:17:43.409805 543705 cpu.go:282] Add success.
I0320 10:17:43.419867 543705 net.go:648] Add success.
I0320 10:17:43.422169 543705 net.go:770] primary dev: ETH0
I0320 10:17:43.422182 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:17:43.422194 543705 net.go:698] Add success.
I0320 10:17:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:17:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:17:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:17:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:17:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:17:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 10:18:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:03.409767 543705 memory.go:184] no items to output this cycle
I0320 10:18:03.409838 543705 cpu.go:275] no items to output this cycle
E0320 10:18:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:13.409813 543705 memory.go:191] Add success.
I0320 10:18:13.409826 543705 cpu.go:282] Add success.
W0320 10:18:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:18:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:18:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:18:13.420163 543705 net.go:648] Add success.
I0320 10:18:13.422883 543705 net.go:770] primary dev: ETH0
I0320 10:18:13.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:18:13.422909 543705 net.go:698] Add success.
I0320 10:18:13.470666 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"666b6968-a603-4e4c-940c-a83a0ab8379d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:18:13.470698 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:18:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:18:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:18:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 10:18:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:18:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 10:18:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:18:15.455983 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:18:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:18:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:18:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:18:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:18:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:23.409821 543705 memory.go:184] no items to output this cycle
I0320 10:18:23.409826 543705 cpu.go:275] no items to output this cycle
I0320 10:18:23.808176 543705 disk_info.go:125] begin check local disk info of client
I0320 10:18:23.810739 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:18:23.810745 543705 disk_info.go:196] parse disk info done, disk is : [0xc000397cc0 0xc000397d00]
E0320 10:18:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:33.409772 543705 memory.go:184] no items to output this cycle
I0320 10:18:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 10:18:38.426888 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:18:38.426895 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:18:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:43.410680 543705 memory.go:191] Add success.
I0320 10:18:43.409808 543705 cpu.go:282] Add success.
I0320 10:18:43.420451 543705 net.go:648] Add success.
I0320 10:18:43.423001 543705 net.go:770] primary dev: ETH0
I0320 10:18:43.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:18:43.423032 543705 net.go:698] Add success.
I0320 10:18:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:18:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:18:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:18:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:18:53.409780 543705 memory.go:184] no items to output this cycle
I0320 10:18:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:19:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:03.409775 543705 memory.go:184] no items to output this cycle
I0320 10:19:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 10:19:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:13.409795 543705 cpu.go:282] Add success.
I0320 10:19:13.409796 543705 memory.go:191] Add success.
W0320 10:19:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:19:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:19:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:19:13.420082 543705 net.go:648] Add success.
I0320 10:19:13.422830 543705 net.go:770] primary dev: ETH0
I0320 10:19:13.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:19:13.422863 543705 net.go:698] Add success.
I0320 10:19:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:19:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:19:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 10:19:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:19:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 10:19:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:19:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:19:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:19:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:19:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:23.409792 543705 memory.go:184] no items to output this cycle
I0320 10:19:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 10:19:23.812167 543705 disk_info.go:125] begin check local disk info of client
I0320 10:19:23.814750 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:19:23.814758 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7700 0xc0001c7740]
E0320 10:19:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:33.409779 543705 memory.go:184] no items to output this cycle
I0320 10:19:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 10:19:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:43.409801 543705 memory.go:191] Add success.
I0320 10:19:43.409802 543705 cpu.go:282] Add success.
I0320 10:19:43.420258 543705 net.go:648] Add success.
I0320 10:19:43.422777 543705 net.go:770] primary dev: ETH0
I0320 10:19:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:19:43.422803 543705 net.go:698] Add success.
I0320 10:19:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:19:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:19:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:19:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:19:53.409768 543705 memory.go:184] no items to output this cycle
I0320 10:19:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:20:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:03.409802 543705 memory.go:184] no items to output this cycle
I0320 10:20:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 10:20:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:13.409813 543705 memory.go:191] Add success.
I0320 10:20:13.409819 543705 cpu.go:282] Add success.
W0320 10:20:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:20:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:20:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:20:13.420144 543705 net.go:648] Add success.
I0320 10:20:13.422855 543705 net.go:770] primary dev: ETH0
I0320 10:20:13.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:20:13.422884 543705 net.go:698] Add success.
I0320 10:20:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:20:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:20:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 10:20:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:20:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 10:20:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:20:15.456009 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:20:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:20:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:20:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:20:16.472443 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:20:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:23.409787 543705 memory.go:184] no items to output this cycle
I0320 10:20:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 10:20:23.816183 543705 disk_info.go:125] begin check local disk info of client
I0320 10:20:23.818734 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:20:23.818739 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7e80 0xc0001c7ec0]
E0320 10:20:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:33.409803 543705 memory.go:184] no items to output this cycle
I0320 10:20:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 10:20:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:43.409815 543705 memory.go:191] Add success.
I0320 10:20:43.409824 543705 cpu.go:282] Add success.
I0320 10:20:43.420062 543705 net.go:648] Add success.
I0320 10:20:43.422691 543705 net.go:770] primary dev: ETH0
I0320 10:20:43.422706 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:20:43.422719 543705 net.go:698] Add success.
I0320 10:20:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:20:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:20:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:20:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:20:53.409799 543705 memory.go:184] no items to output this cycle
I0320 10:20:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 10:21:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:03.409781 543705 memory.go:184] no items to output this cycle
I0320 10:21:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 10:21:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:13.409810 543705 memory.go:191] Add success.
I0320 10:21:13.409822 543705 cpu.go:282] Add success.
W0320 10:21:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:21:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:21:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:21:13.420280 543705 net.go:648] Add success.
I0320 10:21:13.422973 543705 net.go:770] primary dev: ETH0
I0320 10:21:13.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:21:13.423000 543705 net.go:698] Add success.
I0320 10:21:13.464174 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6f208b6f-12d5-45e6-b5b0-36994d49c063","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:21:13.464207 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:21:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:21:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:21:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 10:21:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:21:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 10:21:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:21:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:21:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:21:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:21:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:21:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:21:23.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:23.409825 543705 memory.go:184] no items to output this cycle
I0320 10:21:23.409834 543705 cpu.go:275] no items to output this cycle
I0320 10:21:23.818823 543705 disk_info.go:125] begin check local disk info of client
I0320 10:21:23.821375 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:21:23.821381 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001c7180 0xc0001c71c0]
E0320 10:21:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:33.409766 543705 memory.go:184] no items to output this cycle
I0320 10:21:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 10:21:38.427897 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:21:38.427905 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:21:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:43.410847 543705 memory.go:191] Add success.
I0320 10:21:43.409824 543705 cpu.go:282] Add success.
I0320 10:21:43.420614 543705 net.go:648] Add success.
I0320 10:21:43.423215 543705 net.go:770] primary dev: ETH0
I0320 10:21:43.423229 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:21:43.423243 543705 net.go:698] Add success.
I0320 10:21:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:21:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:21:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:21:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:21:53.409768 543705 memory.go:184] no items to output this cycle
I0320 10:21:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 10:22:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:03.409782 543705 memory.go:184] no items to output this cycle
I0320 10:22:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 10:22:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:13.409788 543705 memory.go:191] Add success.
I0320 10:22:13.409789 543705 cpu.go:282] Add success.
W0320 10:22:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:22:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:22:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:22:13.420314 543705 net.go:648] Add success.
I0320 10:22:13.423132 543705 net.go:770] primary dev: ETH0
I0320 10:22:13.423146 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:22:13.423158 543705 net.go:698] Add success.
W0320 10:22:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:22:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 10:22:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:22:14.455865 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:22:14.455874 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:22:14.455880 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:22:14.456536 543705 disk_worker.go:494] system disk:vda1
I0320 10:22:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:22:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:22:15.456793 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:22:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:22:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:22:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:22:16.457994 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:22:16.472317 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:22:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:23.409813 543705 memory.go:184] no items to output this cycle
I0320 10:22:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 10:22:23.821672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:22:23.824255 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:22:23.824261 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046fc40 0xc00046fc80]
E0320 10:22:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:33.409771 543705 memory.go:184] no items to output this cycle
I0320 10:22:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 10:22:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:43.409816 543705 memory.go:191] Add success.
I0320 10:22:43.409823 543705 cpu.go:282] Add success.
I0320 10:22:43.419970 543705 net.go:648] Add success.
I0320 10:22:43.422709 543705 net.go:770] primary dev: ETH0
I0320 10:22:43.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:22:43.422734 543705 net.go:698] Add success.
I0320 10:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:22:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:22:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:22:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:22:53.409797 543705 memory.go:184] no items to output this cycle
I0320 10:22:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 10:23:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:03.409785 543705 memory.go:184] no items to output this cycle
I0320 10:23:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 10:23:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:13.409784 543705 memory.go:191] Add success.
I0320 10:23:13.409785 543705 cpu.go:282] Add success.
W0320 10:23:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:23:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:23:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:23:13.420216 543705 net.go:648] Add success.
I0320 10:23:13.422979 543705 net.go:770] primary dev: ETH0
I0320 10:23:13.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:23:13.423023 543705 net.go:698] Add success.
I0320 10:23:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:23:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:23:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 10:23:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:23:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 10:23:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:23:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:23:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:23:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:23:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:23:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:23:23.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:23.409894 543705 memory.go:184] no items to output this cycle
I0320 10:23:23.410064 543705 cpu.go:275] no items to output this cycle
I0320 10:23:23.825669 543705 disk_info.go:125] begin check local disk info of client
I0320 10:23:23.828199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:23:23.828206 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b9140 0xc0002b9180]
E0320 10:23:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:33.409767 543705 memory.go:184] no items to output this cycle
I0320 10:23:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:23:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:43.409818 543705 memory.go:191] Add success.
I0320 10:23:43.409821 543705 cpu.go:282] Add success.
I0320 10:23:43.420085 543705 net.go:648] Add success.
I0320 10:23:43.423007 543705 net.go:770] primary dev: ETH0
I0320 10:23:43.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:23:43.423046 543705 net.go:698] Add success.
I0320 10:23:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:23:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:23:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:23:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:23:53.409775 543705 cpu.go:275] no items to output this cycle
I0320 10:23:53.409782 543705 memory.go:184] no items to output this cycle
E0320 10:24:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:03.409807 543705 memory.go:184] no items to output this cycle
I0320 10:24:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 10:24:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:13.409780 543705 memory.go:191] Add success.
I0320 10:24:13.409802 543705 cpu.go:282] Add success.
W0320 10:24:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:24:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:24:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:24:13.420159 543705 net.go:648] Add success.
I0320 10:24:13.422814 543705 net.go:770] primary dev: ETH0
I0320 10:24:13.422828 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:24:13.422841 543705 net.go:698] Add success.
I0320 10:24:13.463999 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5288ab69-56b6-492d-98ae-dba7aaf5fd21","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:24:13.464032 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:24:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:24:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:24:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 10:24:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:24:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 10:24:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:24:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:24:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:24:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:24:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:24:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:24:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:23.409783 543705 memory.go:184] no items to output this cycle
I0320 10:24:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 10:24:23.828294 543705 disk_info.go:125] begin check local disk info of client
I0320 10:24:23.830919 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:24:23.830925 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025a000 0xc00025a040]
E0320 10:24:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:33.409780 543705 memory.go:184] no items to output this cycle
I0320 10:24:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 10:24:38.428902 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:24:38.428908 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:24:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:43.410844 543705 memory.go:191] Add success.
I0320 10:24:43.409825 543705 cpu.go:282] Add success.
I0320 10:24:43.420558 543705 net.go:648] Add success.
I0320 10:24:43.423574 543705 net.go:770] primary dev: ETH0
I0320 10:24:43.423593 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:24:43.423609 543705 net.go:698] Add success.
I0320 10:24:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:24:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:24:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:24:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:24:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:24:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 10:25:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:03.409801 543705 memory.go:184] no items to output this cycle
I0320 10:25:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 10:25:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:13.409776 543705 memory.go:191] Add success.
W0320 10:25:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:25:13.409808 543705 cpu.go:282] Add success.
W0320 10:25:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:25:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:25:13.420121 543705 net.go:648] Add success.
I0320 10:25:13.423378 543705 net.go:770] primary dev: ETH0
I0320 10:25:13.423393 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:25:13.423407 543705 net.go:698] Add success.
I0320 10:25:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:25:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:25:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 10:25:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:25:14.456499 543705 disk_worker.go:494] system disk:vda1
I0320 10:25:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:25:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:25:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:25:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:25:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:25:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:25:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:23.409872 543705 memory.go:184] no items to output this cycle
I0320 10:25:23.409926 543705 cpu.go:275] no items to output this cycle
I0320 10:25:23.831002 543705 disk_info.go:125] begin check local disk info of client
I0320 10:25:23.833572 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:25:23.833577 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039d780 0xc00039d7c0]
E0320 10:25:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:33.409801 543705 memory.go:184] no items to output this cycle
I0320 10:25:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 10:25:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:43.409791 543705 memory.go:191] Add success.
I0320 10:25:43.409808 543705 cpu.go:282] Add success.
I0320 10:25:43.420058 543705 net.go:648] Add success.
I0320 10:25:43.422851 543705 net.go:770] primary dev: ETH0
I0320 10:25:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:25:43.422876 543705 net.go:698] Add success.
I0320 10:25:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:25:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:25:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:25:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:25:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:25:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:26:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:03.409781 543705 memory.go:184] no items to output this cycle
I0320 10:26:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 10:26:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:13.409801 543705 memory.go:191] Add success.
I0320 10:26:13.409807 543705 cpu.go:282] Add success.
W0320 10:26:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:26:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:26:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:26:13.420149 543705 net.go:648] Add success.
I0320 10:26:13.422969 543705 net.go:770] primary dev: ETH0
I0320 10:26:13.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:26:13.422994 543705 net.go:698] Add success.
I0320 10:26:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:26:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:26:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 10:26:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:26:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 10:26:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:26:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:26:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:26:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:26:23.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:23.409825 543705 memory.go:184] no items to output this cycle
I0320 10:26:23.409830 543705 cpu.go:275] no items to output this cycle
I0320 10:26:23.833672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:26:23.836189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:26:23.836195 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025c000 0xc00025c040]
E0320 10:26:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:33.409777 543705 memory.go:184] no items to output this cycle
I0320 10:26:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:26:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:43.409798 543705 memory.go:191] Add success.
I0320 10:26:43.409798 543705 cpu.go:282] Add success.
I0320 10:26:43.420074 543705 net.go:648] Add success.
I0320 10:26:43.422824 543705 net.go:770] primary dev: ETH0
I0320 10:26:43.422839 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:26:43.422853 543705 net.go:698] Add success.
I0320 10:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:26:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:26:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:26:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:26:53.409801 543705 memory.go:184] no items to output this cycle
I0320 10:26:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 10:27:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:03.409784 543705 memory.go:184] no items to output this cycle
I0320 10:27:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:27:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:13.409786 543705 cpu.go:282] Add success.
I0320 10:27:13.409788 543705 memory.go:191] Add success.
W0320 10:27:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:27:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:27:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:27:13.420047 543705 net.go:648] Add success.
I0320 10:27:13.428857 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 10:27:13.428937 543705 net.go:770] primary dev: ETH0
I0320 10:27:13.428950 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:27:13.428961 543705 net.go:698] Add success.
I0320 10:27:13.453494 543705 event_worker.go:152] Polling the log file for events...
I0320 10:27:13.550954 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"52e705e7-28cc-484b-8ad9-ae504019f6d4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:27:13.550989 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 10:27:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:27:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 10:27:14.455156 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:27:14.456111 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:27:14.456121 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:27:14.456126 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:27:14.456727 543705 disk_worker.go:494] system disk:vda1
I0320 10:27:14.456757 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:27:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:27:15.456866 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:27:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:27:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:27:16.457976 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:27:16.457993 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:27:16.472329 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:27:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:23.409785 543705 memory.go:184] no items to output this cycle
I0320 10:27:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 10:27:23.836274 543705 disk_info.go:125] begin check local disk info of client
I0320 10:27:23.838808 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:27:23.838814 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aac80 0xc0001aacc0]
E0320 10:27:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:33.409800 543705 memory.go:184] no items to output this cycle
I0320 10:27:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 10:27:38.429911 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:27:38.429919 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:27:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:43.410699 543705 memory.go:191] Add success.
I0320 10:27:43.409812 543705 cpu.go:282] Add success.
I0320 10:27:43.420404 543705 net.go:648] Add success.
I0320 10:27:43.423045 543705 net.go:770] primary dev: ETH0
I0320 10:27:43.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:27:43.423075 543705 net.go:698] Add success.
I0320 10:27:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:27:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:27:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:27:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:27:53.409775 543705 memory.go:184] no items to output this cycle
I0320 10:27:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 10:28:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:03.409807 543705 memory.go:184] no items to output this cycle
I0320 10:28:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 10:28:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:13.409782 543705 memory.go:191] Add success.
I0320 10:28:13.409808 543705 cpu.go:282] Add success.
W0320 10:28:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:28:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:28:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:28:13.420065 543705 net.go:648] Add success.
I0320 10:28:13.422668 543705 net.go:770] primary dev: ETH0
I0320 10:28:13.422681 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:28:13.422693 543705 net.go:698] Add success.
I0320 10:28:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:28:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:28:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 10:28:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:28:14.456825 543705 disk_worker.go:494] system disk:vda1
I0320 10:28:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:28:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:28:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:28:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:28:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:28:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:28:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:23.409789 543705 memory.go:184] no items to output this cycle
I0320 10:28:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 10:28:23.840327 543705 disk_info.go:125] begin check local disk info of client
I0320 10:28:23.842976 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:28:23.842982 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464b40 0xc000464b80]
E0320 10:28:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:33.409767 543705 memory.go:184] no items to output this cycle
I0320 10:28:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:28:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:43.409793 543705 memory.go:191] Add success.
I0320 10:28:43.409797 543705 cpu.go:282] Add success.
I0320 10:28:43.419971 543705 net.go:648] Add success.
I0320 10:28:43.422568 543705 net.go:770] primary dev: ETH0
I0320 10:28:43.422581 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:28:43.422594 543705 net.go:698] Add success.
I0320 10:28:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:28:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:28:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:28:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:28:53.409785 543705 memory.go:184] no items to output this cycle
I0320 10:28:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 10:29:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:03.409780 543705 memory.go:184] no items to output this cycle
I0320 10:29:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 10:29:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:13.409814 543705 memory.go:191] Add success.
I0320 10:29:13.409816 543705 cpu.go:282] Add success.
W0320 10:29:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:29:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:29:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:29:13.420213 543705 net.go:648] Add success.
I0320 10:29:13.422895 543705 net.go:770] primary dev: ETH0
I0320 10:29:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:29:13.423085 543705 net.go:698] Add success.
I0320 10:29:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:29:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:29:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 10:29:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:29:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 10:29:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:29:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:29:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:29:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:29:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:29:16.472361 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:29:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:23.409801 543705 memory.go:184] no items to output this cycle
I0320 10:29:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 10:29:23.844341 543705 disk_info.go:125] begin check local disk info of client
I0320 10:29:23.846917 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:29:23.846922 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bac0 0xc00007bb00]
E0320 10:29:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:33.409784 543705 memory.go:184] no items to output this cycle
I0320 10:29:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 10:29:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:43.409818 543705 memory.go:191] Add success.
I0320 10:29:43.409831 543705 cpu.go:282] Add success.
I0320 10:29:43.419967 543705 net.go:648] Add success.
I0320 10:29:43.422491 543705 net.go:770] primary dev: ETH0
I0320 10:29:43.422504 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:29:43.422516 543705 net.go:698] Add success.
I0320 10:29:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:29:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:29:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:29:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:29:53.409800 543705 memory.go:184] no items to output this cycle
I0320 10:29:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 10:30:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:03.409799 543705 memory.go:184] no items to output this cycle
I0320 10:30:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 10:30:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:13.409790 543705 memory.go:191] Add success.
I0320 10:30:13.409797 543705 cpu.go:282] Add success.
W0320 10:30:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:30:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:30:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:30:13.419737 543705 net.go:648] Add success.
I0320 10:30:13.422394 543705 net.go:770] primary dev: ETH0
I0320 10:30:13.422407 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:30:13.422418 543705 net.go:698] Add success.
I0320 10:30:13.517518 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d4622b4f-f40d-44f2-bc96-9ebca161ccb6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:30:13.517550 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:30:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:30:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:30:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 10:30:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:30:14.456770 543705 disk_worker.go:494] system disk:vda1
I0320 10:30:14.456807 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:30:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:30:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:30:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:30:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:30:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:30:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:23.409783 543705 memory.go:184] no items to output this cycle
I0320 10:30:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 10:30:23.847006 543705 disk_info.go:125] begin check local disk info of client
I0320 10:30:23.849569 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:30:23.849575 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fba40 0xc0001fba80]
E0320 10:30:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:33.409773 543705 memory.go:184] no items to output this cycle
I0320 10:30:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 10:30:38.430908 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:30:38.430914 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:30:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:43.410624 543705 memory.go:191] Add success.
I0320 10:30:43.409801 543705 cpu.go:282] Add success.
I0320 10:30:43.420360 543705 net.go:648] Add success.
I0320 10:30:43.423017 543705 net.go:770] primary dev: ETH0
I0320 10:30:43.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:30:43.423044 543705 net.go:698] Add success.
I0320 10:30:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:30:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:30:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:30:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:30:53.409801 543705 memory.go:184] no items to output this cycle
I0320 10:30:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 10:31:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:03.409789 543705 memory.go:184] no items to output this cycle
I0320 10:31:03.409792 543705 cpu.go:275] no items to output this cycle
W0320 10:31:13.409719 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:31:13.409737 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:31:13.409744 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:31:13.409805 543705 cpu.go:282] Add success.
E0320 10:31:13.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:13.409864 543705 memory.go:191] Add success.
I0320 10:31:13.420090 543705 net.go:648] Add success.
I0320 10:31:13.422736 543705 net.go:770] primary dev: ETH0
I0320 10:31:13.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:31:13.422762 543705 net.go:698] Add success.
I0320 10:31:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:31:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:31:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 10:31:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:31:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 10:31:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:31:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:31:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:31:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:31:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:31:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:31:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:23.409788 543705 memory.go:184] no items to output this cycle
I0320 10:31:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 10:31:23.849673 543705 disk_info.go:125] begin check local disk info of client
I0320 10:31:23.852197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:31:23.852203 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5340 0xc0000c5380]
E0320 10:31:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:33.409784 543705 memory.go:184] no items to output this cycle
I0320 10:31:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 10:31:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:43.409797 543705 memory.go:191] Add success.
I0320 10:31:43.409824 543705 cpu.go:282] Add success.
I0320 10:31:43.419985 543705 net.go:648] Add success.
I0320 10:31:43.422591 543705 net.go:770] primary dev: ETH0
I0320 10:31:43.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:31:43.422618 543705 net.go:698] Add success.
I0320 10:31:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:31:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:31:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:31:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:31:53.409790 543705 memory.go:184] no items to output this cycle
I0320 10:31:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 10:32:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:03.409813 543705 memory.go:184] no items to output this cycle
I0320 10:32:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 10:32:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:13.409793 543705 memory.go:191] Add success.
I0320 10:32:13.409795 543705 cpu.go:282] Add success.
W0320 10:32:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:32:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:32:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:32:13.420281 543705 net.go:648] Add success.
I0320 10:32:13.423318 543705 net.go:770] primary dev: ETH0
I0320 10:32:13.423333 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:32:13.423346 543705 net.go:698] Add success.
W0320 10:32:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:32:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 10:32:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:32:14.456796 543705 disk_worker.go:494] system disk:vda1
I0320 10:32:14.456835 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:32:14.457052 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:32:14.457060 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:32:14.457064 543705 custom_config.go:64] query custom config with name: gpu
E0320 10:32:15.456879 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:32:15.456889 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:32:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:32:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:32:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:32:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:32:16.472340 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:32:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:23.409802 543705 memory.go:184] no items to output this cycle
I0320 10:32:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 10:32:23.853670 543705 disk_info.go:125] begin check local disk info of client
I0320 10:32:23.856187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:32:23.856192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5000 0xc0000c5040]
E0320 10:32:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:33.409780 543705 memory.go:184] no items to output this cycle
I0320 10:32:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 10:32:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:43.409787 543705 memory.go:191] Add success.
I0320 10:32:43.409815 543705 cpu.go:282] Add success.
I0320 10:32:43.420144 543705 net.go:648] Add success.
I0320 10:32:43.422693 543705 net.go:770] primary dev: ETH0
I0320 10:32:43.422708 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:32:43.422722 543705 net.go:698] Add success.
I0320 10:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:32:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:32:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:32:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:32:53.409809 543705 memory.go:184] no items to output this cycle
I0320 10:32:53.409829 543705 cpu.go:275] no items to output this cycle
E0320 10:33:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:03.409778 543705 cpu.go:275] no items to output this cycle
I0320 10:33:03.409794 543705 memory.go:184] no items to output this cycle
E0320 10:33:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:13.409786 543705 memory.go:191] Add success.
I0320 10:33:13.409788 543705 cpu.go:282] Add success.
W0320 10:33:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:33:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:33:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:33:13.420193 543705 net.go:648] Add success.
I0320 10:33:13.423200 543705 net.go:770] primary dev: ETH0
I0320 10:33:13.423216 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:33:13.423230 543705 net.go:698] Add success.
I0320 10:33:13.548301 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"856d2b35-c5d8-468e-a8fe-fa9ba78a8309","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:33:13.548338 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:33:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:33:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:33:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 10:33:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:33:14.456699 543705 disk_worker.go:494] system disk:vda1
I0320 10:33:14.456735 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:33:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:33:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:33:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:33:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:33:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:33:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:23.409778 543705 memory.go:184] no items to output this cycle
I0320 10:33:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 10:33:23.856272 543705 disk_info.go:125] begin check local disk info of client
I0320 10:33:23.858830 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:33:23.858835 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046b740 0xc00046b780]
E0320 10:33:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 10:33:33.409788 543705 memory.go:184] no items to output this cycle
I0320 10:33:38.431933 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:33:38.431941 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:33:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:43.410631 543705 memory.go:191] Add success.
I0320 10:33:43.409797 543705 cpu.go:282] Add success.
I0320 10:33:43.420312 543705 net.go:648] Add success.
I0320 10:33:43.422752 543705 net.go:770] primary dev: ETH0
I0320 10:33:43.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:33:43.422778 543705 net.go:698] Add success.
I0320 10:33:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:33:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:33:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:33:53.409839 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:33:53.409857 543705 memory.go:184] no items to output this cycle
I0320 10:33:53.409980 543705 cpu.go:275] no items to output this cycle
E0320 10:34:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:03.409799 543705 memory.go:184] no items to output this cycle
I0320 10:34:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 10:34:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:13.409789 543705 memory.go:191] Add success.
I0320 10:34:13.409794 543705 cpu.go:282] Add success.
W0320 10:34:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:34:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:34:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:34:13.420048 543705 net.go:648] Add success.
I0320 10:34:13.422723 543705 net.go:770] primary dev: ETH0
I0320 10:34:13.422735 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:34:13.422747 543705 net.go:698] Add success.
I0320 10:34:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:34:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:34:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 10:34:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:34:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 10:34:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:34:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:34:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:34:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:34:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:34:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:34:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:23.409806 543705 memory.go:184] no items to output this cycle
I0320 10:34:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 10:34:23.860426 543705 disk_info.go:125] begin check local disk info of client
I0320 10:34:23.862957 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:34:23.862962 543705 disk_info.go:196] parse disk info done, disk is : [0xc000254100 0xc000254140]
E0320 10:34:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:33.409800 543705 memory.go:184] no items to output this cycle
I0320 10:34:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 10:34:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:43.409797 543705 memory.go:191] Add success.
I0320 10:34:43.409797 543705 cpu.go:282] Add success.
I0320 10:34:43.420071 543705 net.go:648] Add success.
I0320 10:34:43.422862 543705 net.go:770] primary dev: ETH0
I0320 10:34:43.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:34:43.422891 543705 net.go:698] Add success.
I0320 10:34:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:34:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:34:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:34:53.410665 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:34:53.410689 543705 memory.go:184] no items to output this cycle
I0320 10:34:53.410705 543705 cpu.go:275] no items to output this cycle
E0320 10:35:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:03.409767 543705 memory.go:184] no items to output this cycle
I0320 10:35:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 10:35:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:13.409821 543705 memory.go:191] Add success.
I0320 10:35:13.409830 543705 cpu.go:282] Add success.
W0320 10:35:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:35:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:35:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:35:13.420136 543705 net.go:648] Add success.
I0320 10:35:13.423130 543705 net.go:770] primary dev: ETH0
I0320 10:35:13.423143 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:35:13.423155 543705 net.go:698] Add success.
I0320 10:35:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:35:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:35:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 10:35:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:35:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 10:35:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:35:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:35:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:35:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:35:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:35:16.472422 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:35:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 10:35:23.409789 543705 memory.go:184] no items to output this cycle
I0320 10:35:23.863053 543705 disk_info.go:125] begin check local disk info of client
I0320 10:35:23.865625 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:35:23.865632 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003425c0 0xc000342600]
E0320 10:35:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:33.409762 543705 memory.go:184] no items to output this cycle
I0320 10:35:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 10:35:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:43.409813 543705 memory.go:191] Add success.
I0320 10:35:43.409819 543705 cpu.go:282] Add success.
I0320 10:35:43.419997 543705 net.go:648] Add success.
I0320 10:35:43.422641 543705 net.go:770] primary dev: ETH0
I0320 10:35:43.422654 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:35:43.422666 543705 net.go:698] Add success.
I0320 10:35:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:35:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:35:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:35:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:35:53.409781 543705 memory.go:184] no items to output this cycle
I0320 10:35:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 10:36:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:03.409779 543705 memory.go:184] no items to output this cycle
I0320 10:36:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 10:36:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:13.409815 543705 memory.go:191] Add success.
I0320 10:36:13.409831 543705 cpu.go:282] Add success.
W0320 10:36:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:36:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:36:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:36:13.420163 543705 net.go:648] Add success.
I0320 10:36:13.423174 543705 net.go:770] primary dev: ETH0
I0320 10:36:13.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:36:13.423200 543705 net.go:698] Add success.
I0320 10:36:13.467839 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"26ce0f79-de72-4247-892b-c55cb256054d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:36:13.467873 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:36:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:36:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:36:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 10:36:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:36:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 10:36:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:36:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:36:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:36:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:36:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:36:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:36:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:23.409782 543705 memory.go:184] no items to output this cycle
I0320 10:36:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 10:36:23.865678 543705 disk_info.go:125] begin check local disk info of client
I0320 10:36:23.868229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:36:23.868235 543705 disk_info.go:196] parse disk info done, disk is : [0xc000306740 0xc000306780]
E0320 10:36:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:33.409799 543705 memory.go:184] no items to output this cycle
I0320 10:36:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 10:36:38.432920 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:36:38.432926 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:36:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:43.410744 543705 memory.go:191] Add success.
I0320 10:36:43.409796 543705 cpu.go:282] Add success.
I0320 10:36:43.420761 543705 net.go:648] Add success.
I0320 10:36:43.423598 543705 net.go:770] primary dev: ETH0
I0320 10:36:43.423611 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:36:43.423623 543705 net.go:698] Add success.
I0320 10:36:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:36:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:36:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:36:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:36:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:36:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 10:37:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:03.409776 543705 memory.go:184] no items to output this cycle
I0320 10:37:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:37:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:13.409793 543705 memory.go:191] Add success.
I0320 10:37:13.409794 543705 cpu.go:282] Add success.
W0320 10:37:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:37:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:37:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:37:13.420061 543705 net.go:648] Add success.
I0320 10:37:13.422848 543705 net.go:770] primary dev: ETH0
I0320 10:37:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:37:13.422877 543705 net.go:698] Add success.
I0320 10:37:13.453445 543705 event_worker.go:152] Polling the log file for events...
W0320 10:37:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:37:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 10:37:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:37:14.456881 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:37:14.456890 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:37:14.456897 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:37:14.456967 543705 disk_worker.go:494] system disk:vda1
I0320 10:37:14.457012 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:37:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:37:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:37:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:37:16.458000 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:37:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:37:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:37:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:37:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:23.409804 543705 memory.go:184] no items to output this cycle
I0320 10:37:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 10:37:23.869673 543705 disk_info.go:125] begin check local disk info of client
I0320 10:37:23.872262 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:37:23.872268 543705 disk_info.go:196] parse disk info done, disk is : [0xc000305900 0xc000305940]
E0320 10:37:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:33.409778 543705 memory.go:184] no items to output this cycle
I0320 10:37:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 10:37:43.409857 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:43.409886 543705 memory.go:191] Add success.
I0320 10:37:43.409965 543705 cpu.go:282] Add success.
I0320 10:37:43.419727 543705 net.go:648] Add success.
I0320 10:37:43.422902 543705 net.go:770] primary dev: ETH0
I0320 10:37:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:37:43.422926 543705 net.go:698] Add success.
I0320 10:37:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:37:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:37:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:37:53.410699 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:37:53.410715 543705 memory.go:184] no items to output this cycle
I0320 10:37:53.410720 543705 cpu.go:275] no items to output this cycle
E0320 10:38:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:03.409766 543705 memory.go:184] no items to output this cycle
I0320 10:38:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:38:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:13.409823 543705 memory.go:191] Add success.
I0320 10:38:13.409826 543705 cpu.go:282] Add success.
W0320 10:38:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:38:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:38:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:38:13.420110 543705 net.go:648] Add success.
I0320 10:38:13.423265 543705 net.go:770] primary dev: ETH0
I0320 10:38:13.423281 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:38:13.423294 543705 net.go:698] Add success.
I0320 10:38:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:38:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:38:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 10:38:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:38:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 10:38:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:38:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:38:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:38:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:38:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:38:23.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:23.409821 543705 memory.go:184] no items to output this cycle
I0320 10:38:23.409826 543705 cpu.go:275] no items to output this cycle
I0320 10:38:23.873676 543705 disk_info.go:125] begin check local disk info of client
I0320 10:38:23.876229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:38:23.876234 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003af580 0xc0003af5c0]
E0320 10:38:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:33.409813 543705 memory.go:184] no items to output this cycle
I0320 10:38:33.409932 543705 cpu.go:275] no items to output this cycle
E0320 10:38:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:43.409804 543705 memory.go:191] Add success.
I0320 10:38:43.409821 543705 cpu.go:282] Add success.
I0320 10:38:43.420002 543705 net.go:648] Add success.
I0320 10:38:43.423060 543705 net.go:770] primary dev: ETH0
I0320 10:38:43.423075 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:38:43.423089 543705 net.go:698] Add success.
I0320 10:38:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:38:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:38:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:38:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:38:53.409791 543705 memory.go:184] no items to output this cycle
I0320 10:38:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 10:39:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:03.409781 543705 memory.go:184] no items to output this cycle
I0320 10:39:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 10:39:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:13.409808 543705 memory.go:191] Add success.
I0320 10:39:13.409808 543705 cpu.go:282] Add success.
W0320 10:39:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:39:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:39:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:39:13.420175 543705 net.go:648] Add success.
I0320 10:39:13.423287 543705 net.go:770] primary dev: ETH0
I0320 10:39:13.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:39:13.423311 543705 net.go:698] Add success.
I0320 10:39:13.463158 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"71d6ceca-c6cf-4408-a85a-3348282194f2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:39:13.463190 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:39:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:39:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:39:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0320 10:39:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:39:14.456768 543705 disk_worker.go:494] system disk:vda1
I0320 10:39:14.456797 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:39:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:39:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:39:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:39:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:39:23.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:23.409922 543705 memory.go:184] no items to output this cycle
I0320 10:39:23.409934 543705 cpu.go:275] no items to output this cycle
I0320 10:39:23.877666 543705 disk_info.go:125] begin check local disk info of client
I0320 10:39:23.880182 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:39:23.880187 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352540 0xc000352580]
E0320 10:39:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:33.409809 543705 memory.go:184] no items to output this cycle
I0320 10:39:33.409827 543705 cpu.go:275] no items to output this cycle
I0320 10:39:38.433929 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:39:38.433937 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:39:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:43.410653 543705 memory.go:191] Add success.
I0320 10:39:43.409837 543705 cpu.go:282] Add success.
I0320 10:39:43.420366 543705 net.go:648] Add success.
I0320 10:39:43.422948 543705 net.go:770] primary dev: ETH0
I0320 10:39:43.422963 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:39:43.422979 543705 net.go:698] Add success.
I0320 10:39:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:39:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:39:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:39:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:39:53.409766 543705 memory.go:184] no items to output this cycle
I0320 10:39:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 10:40:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:03.409782 543705 memory.go:184] no items to output this cycle
I0320 10:40:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 10:40:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:13.409785 543705 memory.go:191] Add success.
I0320 10:40:13.409788 543705 cpu.go:282] Add success.
W0320 10:40:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:40:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:40:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:40:13.420078 543705 net.go:648] Add success.
I0320 10:40:13.422986 543705 net.go:770] primary dev: ETH0
I0320 10:40:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:40:13.423018 543705 net.go:698] Add success.
I0320 10:40:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:40:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:40:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 10:40:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:40:14.456555 543705 disk_worker.go:494] system disk:vda1
I0320 10:40:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:40:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:40:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:40:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:40:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:40:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:40:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:23.409806 543705 memory.go:184] no items to output this cycle
I0320 10:40:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 10:40:23.880279 543705 disk_info.go:125] begin check local disk info of client
I0320 10:40:23.882816 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:40:23.882822 543705 disk_info.go:196] parse disk info done, disk is : [0xc000364000 0xc000364040]
E0320 10:40:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:33.409781 543705 memory.go:184] no items to output this cycle
I0320 10:40:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:40:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:43.409817 543705 memory.go:191] Add success.
I0320 10:40:43.409825 543705 cpu.go:282] Add success.
I0320 10:40:43.420082 543705 net.go:648] Add success.
I0320 10:40:43.423030 543705 net.go:770] primary dev: ETH0
I0320 10:40:43.423042 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:40:43.423055 543705 net.go:698] Add success.
I0320 10:40:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:40:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:40:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:40:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:40:53.409779 543705 cpu.go:275] no items to output this cycle
I0320 10:40:53.409783 543705 memory.go:184] no items to output this cycle
E0320 10:41:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:03.409799 543705 memory.go:184] no items to output this cycle
I0320 10:41:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 10:41:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:13.409785 543705 cpu.go:282] Add success.
I0320 10:41:13.409796 543705 memory.go:191] Add success.
W0320 10:41:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:41:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:41:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:41:13.420141 543705 net.go:648] Add success.
I0320 10:41:13.422980 543705 net.go:770] primary dev: ETH0
I0320 10:41:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:41:13.423009 543705 net.go:698] Add success.
I0320 10:41:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:41:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:41:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 10:41:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:41:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 10:41:14.456532 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:41:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:41:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:41:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:41:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:41:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:41:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:23.409779 543705 memory.go:184] no items to output this cycle
I0320 10:41:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 10:41:23.884551 543705 disk_info.go:125] begin check local disk info of client
I0320 10:41:23.887081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:41:23.887088 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004987c0 0xc000498800]
E0320 10:41:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:33.409783 543705 memory.go:184] no items to output this cycle
I0320 10:41:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 10:41:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:43.409833 543705 memory.go:191] Add success.
I0320 10:41:43.409839 543705 cpu.go:282] Add success.
I0320 10:41:43.419977 543705 net.go:648] Add success.
I0320 10:41:43.422964 543705 net.go:770] primary dev: ETH0
I0320 10:41:43.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:41:43.422989 543705 net.go:698] Add success.
I0320 10:41:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:41:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:41:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:41:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:41:53.409796 543705 memory.go:184] no items to output this cycle
I0320 10:41:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 10:42:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:03.409779 543705 memory.go:184] no items to output this cycle
I0320 10:42:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 10:42:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:13.409787 543705 memory.go:191] Add success.
I0320 10:42:13.409794 543705 cpu.go:282] Add success.
W0320 10:42:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:42:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:42:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:42:13.420206 543705 net.go:648] Add success.
I0320 10:42:13.422936 543705 net.go:770] primary dev: ETH0
I0320 10:42:13.422948 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:42:13.422959 543705 net.go:698] Add success.
I0320 10:42:13.469516 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dd083173-ef5d-4373-ab49-3b1318d162d0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:42:13.469552 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 10:42:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:42:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 10:42:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:42:14.456856 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
I0320 10:42:14.456861 543705 disk_worker.go:494] system disk:vda1
E0320 10:42:14.456865 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:42:14.456870 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:42:14.456914 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:42:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:42:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:42:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:42:16.457918 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:42:16.457971 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:42:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:42:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:42:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:23.409796 543705 memory.go:184] no items to output this cycle
I0320 10:42:23.409835 543705 cpu.go:275] no items to output this cycle
I0320 10:42:23.888646 543705 disk_info.go:125] begin check local disk info of client
I0320 10:42:23.891212 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:42:23.891217 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486b00 0xc000486b40]
E0320 10:42:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:33.409776 543705 memory.go:184] no items to output this cycle
I0320 10:42:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 10:42:38.434926 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:42:38.434932 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:42:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:43.410628 543705 memory.go:191] Add success.
I0320 10:42:43.409805 543705 cpu.go:282] Add success.
I0320 10:42:43.420350 543705 net.go:648] Add success.
I0320 10:42:43.422940 543705 net.go:770] primary dev: ETH0
I0320 10:42:43.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:42:43.422971 543705 net.go:698] Add success.
I0320 10:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:42:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:42:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:42:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:42:53.409776 543705 memory.go:184] no items to output this cycle
I0320 10:42:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 10:43:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:03.409778 543705 memory.go:184] no items to output this cycle
I0320 10:43:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 10:43:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:13.409793 543705 memory.go:191] Add success.
I0320 10:43:13.409797 543705 cpu.go:282] Add success.
W0320 10:43:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:43:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:43:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:43:13.420063 543705 net.go:648] Add success.
I0320 10:43:13.422793 543705 net.go:770] primary dev: ETH0
I0320 10:43:13.422806 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:43:13.422818 543705 net.go:698] Add success.
I0320 10:43:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:43:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:43:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 10:43:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:43:14.456515 543705 disk_worker.go:494] system disk:vda1
I0320 10:43:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:43:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:43:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:43:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:43:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:43:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:43:23.410254 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:23.410273 543705 memory.go:184] no items to output this cycle
I0320 10:43:23.410292 543705 cpu.go:275] no items to output this cycle
I0320 10:43:23.893109 543705 disk_info.go:125] begin check local disk info of client
I0320 10:43:23.895651 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:43:23.895657 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471040 0xc000471080]
E0320 10:43:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:33.409806 543705 memory.go:184] no items to output this cycle
I0320 10:43:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 10:43:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:43.409891 543705 memory.go:191] Add success.
I0320 10:43:43.409950 543705 cpu.go:282] Add success.
I0320 10:43:43.419726 543705 net.go:648] Add success.
I0320 10:43:43.422630 543705 net.go:770] primary dev: ETH0
I0320 10:43:43.422646 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:43:43.422660 543705 net.go:698] Add success.
I0320 10:43:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:43:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:43:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:43:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:43:53.409778 543705 memory.go:184] no items to output this cycle
I0320 10:43:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 10:44:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:03.409776 543705 memory.go:184] no items to output this cycle
I0320 10:44:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 10:44:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:13.409793 543705 memory.go:191] Add success.
I0320 10:44:13.409796 543705 cpu.go:282] Add success.
W0320 10:44:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:44:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:44:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:44:13.420111 543705 net.go:648] Add success.
I0320 10:44:13.422892 543705 net.go:770] primary dev: ETH0
I0320 10:44:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:44:13.422926 543705 net.go:698] Add success.
I0320 10:44:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:44:14.455215 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:44:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 10:44:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:44:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 10:44:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:44:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:44:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:44:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:44:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:44:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:44:23.410345 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:23.410363 543705 memory.go:184] no items to output this cycle
I0320 10:44:23.410384 543705 cpu.go:275] no items to output this cycle
I0320 10:44:23.897210 543705 disk_info.go:125] begin check local disk info of client
I0320 10:44:23.899738 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:44:23.899744 543705 disk_info.go:196] parse disk info done, disk is : [0xc000384200 0xc000384240]
E0320 10:44:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:33.409778 543705 memory.go:184] no items to output this cycle
I0320 10:44:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 10:44:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:43.409799 543705 memory.go:191] Add success.
I0320 10:44:43.409799 543705 cpu.go:282] Add success.
I0320 10:44:43.420152 543705 net.go:648] Add success.
I0320 10:44:43.423076 543705 net.go:770] primary dev: ETH0
I0320 10:44:43.423089 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:44:43.423101 543705 net.go:698] Add success.
I0320 10:44:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:44:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:44:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:44:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:44:53.409791 543705 memory.go:184] no items to output this cycle
I0320 10:44:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:45:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:03.409774 543705 memory.go:184] no items to output this cycle
I0320 10:45:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 10:45:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:13.409786 543705 memory.go:191] Add success.
I0320 10:45:13.409800 543705 cpu.go:282] Add success.
W0320 10:45:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:45:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:45:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:45:13.420128 543705 net.go:648] Add success.
I0320 10:45:13.422993 543705 net.go:770] primary dev: ETH0
I0320 10:45:13.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:45:13.423021 543705 net.go:698] Add success.
I0320 10:45:13.471380 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5b3c22e2-3926-4a9f-8b0f-53e566d8de16","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:45:13.471414 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:45:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:45:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:45:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 10:45:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:45:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 10:45:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:45:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:45:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:45:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:45:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:45:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:45:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:23.409794 543705 memory.go:184] no items to output this cycle
I0320 10:45:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 10:45:23.899825 543705 disk_info.go:125] begin check local disk info of client
I0320 10:45:23.902386 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:45:23.902391 543705 disk_info.go:196] parse disk info done, disk is : [0xc000546f40 0xc000546f80]
E0320 10:45:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:33.409767 543705 memory.go:184] no items to output this cycle
I0320 10:45:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 10:45:38.435928 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:45:38.435936 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:45:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:43.410756 543705 memory.go:191] Add success.
I0320 10:45:43.409814 543705 cpu.go:282] Add success.
I0320 10:45:43.419729 543705 net.go:648] Add success.
I0320 10:45:43.422370 543705 net.go:770] primary dev: ETH0
I0320 10:45:43.422383 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:45:43.422394 543705 net.go:698] Add success.
I0320 10:45:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:45:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:45:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:45:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:45:53.409768 543705 memory.go:184] no items to output this cycle
I0320 10:45:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 10:46:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:03.409804 543705 memory.go:184] no items to output this cycle
I0320 10:46:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 10:46:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:13.409780 543705 memory.go:191] Add success.
W0320 10:46:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:46:13.409810 543705 cpu.go:282] Add success.
W0320 10:46:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:46:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:46:13.420131 543705 net.go:648] Add success.
I0320 10:46:13.422753 543705 net.go:770] primary dev: ETH0
I0320 10:46:13.422766 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:46:13.422777 543705 net.go:698] Add success.
I0320 10:46:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:46:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:46:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 10:46:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:46:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 10:46:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:46:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:46:16.458251 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:46:16.458320 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:46:16.458355 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:46:16.472675 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:46:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:23.409787 543705 memory.go:184] no items to output this cycle
I0320 10:46:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 10:46:23.904616 543705 disk_info.go:125] begin check local disk info of client
I0320 10:46:23.907684 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:46:23.907690 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ac580 0xc0002ac5c0]
E0320 10:46:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:33.409799 543705 memory.go:184] no items to output this cycle
I0320 10:46:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 10:46:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:43.409793 543705 memory.go:191] Add success.
I0320 10:46:43.409806 543705 cpu.go:282] Add success.
I0320 10:46:43.419730 543705 net.go:648] Add success.
I0320 10:46:43.422699 543705 net.go:770] primary dev: ETH0
I0320 10:46:43.422713 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:46:43.422724 543705 net.go:698] Add success.
I0320 10:46:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:46:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:46:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:46:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:46:53.409779 543705 cpu.go:275] no items to output this cycle
I0320 10:46:53.409784 543705 memory.go:184] no items to output this cycle
E0320 10:47:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:03.409792 543705 memory.go:184] no items to output this cycle
I0320 10:47:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 10:47:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:13.409779 543705 memory.go:191] Add success.
I0320 10:47:13.409797 543705 cpu.go:282] Add success.
W0320 10:47:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:47:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:47:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:47:13.420218 543705 net.go:648] Add success.
I0320 10:47:13.423907 543705 net.go:770] primary dev: ETH0
I0320 10:47:13.423926 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:47:13.423948 543705 net.go:698] Add success.
I0320 10:47:13.453504 543705 event_worker.go:152] Polling the log file for events...
W0320 10:47:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:47:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 10:47:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:47:14.455895 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:47:14.455904 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:47:14.455910 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:47:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 10:47:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:47:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:47:15.456842 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:47:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:47:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:47:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:47:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:47:16.472327 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:47:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:23.409815 543705 memory.go:184] no items to output this cycle
I0320 10:47:23.409822 543705 cpu.go:275] no items to output this cycle
I0320 10:47:23.909672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:47:23.912256 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:47:23.912262 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002acc80 0xc0002accc0]
E0320 10:47:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:33.409781 543705 memory.go:184] no items to output this cycle
I0320 10:47:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 10:47:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:43.409820 543705 memory.go:191] Add success.
I0320 10:47:43.409842 543705 cpu.go:282] Add success.
I0320 10:47:43.419977 543705 net.go:648] Add success.
I0320 10:47:43.422498 543705 net.go:770] primary dev: ETH0
I0320 10:47:43.422511 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:47:43.422524 543705 net.go:698] Add success.
I0320 10:47:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:47:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:47:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:47:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:47:53.409799 543705 memory.go:184] no items to output this cycle
I0320 10:47:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 10:48:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:03.409778 543705 cpu.go:275] no items to output this cycle
I0320 10:48:03.409788 543705 memory.go:184] no items to output this cycle
E0320 10:48:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:13.409816 543705 memory.go:191] Add success.
I0320 10:48:13.409818 543705 cpu.go:282] Add success.
W0320 10:48:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:48:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:48:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:48:13.420297 543705 net.go:648] Add success.
I0320 10:48:13.423342 543705 net.go:770] primary dev: ETH0
I0320 10:48:13.423358 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:48:13.423372 543705 net.go:698] Add success.
I0320 10:48:13.699791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bca8e649-85bf-43a7-b43b-bcad7e52d0cd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:48:13.699834 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:48:14.454679 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:48:14.454928 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:48:14.454937 543705 disk_worker.go:708] disk space is not compliant
W0320 10:48:14.454940 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:48:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 10:48:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:48:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:48:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:48:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:48:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:48:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:48:23.410240 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:23.410259 543705 memory.go:184] no items to output this cycle
I0320 10:48:23.410260 543705 cpu.go:275] no items to output this cycle
I0320 10:48:23.913668 543705 disk_info.go:125] begin check local disk info of client
I0320 10:48:23.916191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:48:23.916196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa900 0xc0001aa940]
E0320 10:48:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 10:48:33.409792 543705 memory.go:184] no items to output this cycle
I0320 10:48:38.436945 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:48:38.436953 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:48:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:43.410666 543705 memory.go:191] Add success.
I0320 10:48:43.409802 543705 cpu.go:282] Add success.
I0320 10:48:43.420473 543705 net.go:648] Add success.
I0320 10:48:43.423207 543705 net.go:770] primary dev: ETH0
I0320 10:48:43.423220 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:48:43.423232 543705 net.go:698] Add success.
I0320 10:48:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:48:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:48:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:48:53.409772 543705 memory.go:184] no items to output this cycle
I0320 10:48:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:49:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:03.409788 543705 cpu.go:275] no items to output this cycle
I0320 10:49:03.409791 543705 memory.go:184] no items to output this cycle
E0320 10:49:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:13.409832 543705 memory.go:191] Add success.
I0320 10:49:13.409840 543705 cpu.go:282] Add success.
W0320 10:49:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:49:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:49:13.409886 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:49:13.420142 543705 net.go:648] Add success.
I0320 10:49:13.422795 543705 net.go:770] primary dev: ETH0
I0320 10:49:13.422808 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:49:13.422820 543705 net.go:698] Add success.
I0320 10:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:49:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:49:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 10:49:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:49:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 10:49:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:49:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:49:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:49:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:49:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:49:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:49:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:23.409788 543705 memory.go:184] no items to output this cycle
I0320 10:49:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 10:49:23.917669 543705 disk_info.go:125] begin check local disk info of client
I0320 10:49:23.920194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:49:23.920199 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faf00 0xc0001faf40]
E0320 10:49:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:33.409776 543705 memory.go:184] no items to output this cycle
I0320 10:49:33.409835 543705 cpu.go:275] no items to output this cycle
E0320 10:49:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:43.409804 543705 memory.go:191] Add success.
I0320 10:49:43.409807 543705 cpu.go:282] Add success.
I0320 10:49:43.419962 543705 net.go:648] Add success.
I0320 10:49:43.422662 543705 net.go:770] primary dev: ETH0
I0320 10:49:43.422676 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:49:43.422689 543705 net.go:698] Add success.
I0320 10:49:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:49:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:49:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:49:53.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:49:53.409894 543705 memory.go:184] no items to output this cycle
I0320 10:49:53.410033 543705 cpu.go:275] no items to output this cycle
E0320 10:50:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:03.409802 543705 memory.go:184] no items to output this cycle
I0320 10:50:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 10:50:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:13.409792 543705 memory.go:191] Add success.
I0320 10:50:13.409793 543705 cpu.go:282] Add success.
W0320 10:50:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:50:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:50:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:50:13.420177 543705 net.go:648] Add success.
I0320 10:50:13.423147 543705 net.go:770] primary dev: ETH0
I0320 10:50:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:50:13.423178 543705 net.go:698] Add success.
I0320 10:50:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:50:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:50:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 10:50:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:50:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 10:50:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:50:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:50:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:50:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:50:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:50:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:50:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:23.409808 543705 memory.go:184] no items to output this cycle
I0320 10:50:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 10:50:23.921668 543705 disk_info.go:125] begin check local disk info of client
I0320 10:50:23.924244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:50:23.924250 543705 disk_info.go:196] parse disk info done, disk is : [0xc000392300 0xc000392340]
E0320 10:50:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:33.409809 543705 memory.go:184] no items to output this cycle
I0320 10:50:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 10:50:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:43.409821 543705 memory.go:191] Add success.
I0320 10:50:43.409829 543705 cpu.go:282] Add success.
I0320 10:50:43.420038 543705 net.go:648] Add success.
I0320 10:50:43.422883 543705 net.go:770] primary dev: ETH0
I0320 10:50:43.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:50:43.422908 543705 net.go:698] Add success.
I0320 10:50:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:50:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:50:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:50:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:50:53.409767 543705 memory.go:184] no items to output this cycle
I0320 10:50:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 10:51:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:03.409806 543705 memory.go:184] no items to output this cycle
I0320 10:51:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 10:51:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:13.409787 543705 memory.go:191] Add success.
I0320 10:51:13.409801 543705 cpu.go:282] Add success.
W0320 10:51:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:51:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:51:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:51:13.420155 543705 net.go:648] Add success.
I0320 10:51:13.422898 543705 net.go:770] primary dev: ETH0
I0320 10:51:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:51:13.422923 543705 net.go:698] Add success.
I0320 10:51:13.469391 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"75ba7b4d-b809-434c-9ab8-0f0835f26791","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:51:13.469423 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:51:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:51:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:51:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0320 10:51:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:51:14.456768 543705 disk_worker.go:494] system disk:vda1
I0320 10:51:14.456797 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:51:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:51:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:51:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:51:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:51:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:51:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:23.409779 543705 memory.go:184] no items to output this cycle
I0320 10:51:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 10:51:23.925671 543705 disk_info.go:125] begin check local disk info of client
I0320 10:51:23.928197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:51:23.928202 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492fc0 0xc000493000]
E0320 10:51:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:33.409781 543705 memory.go:184] no items to output this cycle
I0320 10:51:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 10:51:38.437958 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:51:38.437966 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:51:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:43.410723 543705 memory.go:191] Add success.
I0320 10:51:43.409834 543705 cpu.go:282] Add success.
I0320 10:51:43.420437 543705 net.go:648] Add success.
I0320 10:51:43.423090 543705 net.go:770] primary dev: ETH0
I0320 10:51:43.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:51:43.423116 543705 net.go:698] Add success.
I0320 10:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:51:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:51:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:51:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:51:53.409768 543705 memory.go:184] no items to output this cycle
I0320 10:51:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 10:52:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:03.409898 543705 memory.go:184] no items to output this cycle
I0320 10:52:03.409977 543705 cpu.go:275] no items to output this cycle
E0320 10:52:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:13.409781 543705 memory.go:191] Add success.
I0320 10:52:13.409805 543705 cpu.go:282] Add success.
W0320 10:52:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:52:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:52:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:52:13.420104 543705 net.go:648] Add success.
I0320 10:52:13.422581 543705 net.go:770] primary dev: ETH0
I0320 10:52:13.422596 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:52:13.422610 543705 net.go:698] Add success.
W0320 10:52:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:52:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 10:52:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:52:14.456764 543705 disk_worker.go:494] system disk:vda1
I0320 10:52:14.456805 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:52:14.457105 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:52:14.457112 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:52:14.457117 543705 custom_config.go:64] query custom config with name: gpu
E0320 10:52:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:52:15.456841 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:52:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 10:52:16.457963 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:52:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:52:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:52:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:52:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:23.409785 543705 memory.go:184] no items to output this cycle
I0320 10:52:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 10:52:23.929672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:52:23.932211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:52:23.932216 543705 disk_info.go:196] parse disk info done, disk is : [0xc000393e00 0xc000393e40]
E0320 10:52:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:33.409774 543705 memory.go:184] no items to output this cycle
I0320 10:52:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 10:52:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:43.409785 543705 memory.go:191] Add success.
I0320 10:52:43.409806 543705 cpu.go:282] Add success.
I0320 10:52:43.420019 543705 net.go:648] Add success.
I0320 10:52:43.422667 543705 net.go:770] primary dev: ETH0
I0320 10:52:43.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:52:43.422693 543705 net.go:698] Add success.
I0320 10:52:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:52:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:52:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:52:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:52:53.409777 543705 memory.go:184] no items to output this cycle
I0320 10:52:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 10:53:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:03.409794 543705 memory.go:184] no items to output this cycle
I0320 10:53:03.409807 543705 cpu.go:275] no items to output this cycle
I0320 10:53:13.409897 543705 cpu.go:282] Add success.
E0320 10:53:13.409959 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:13.409985 543705 memory.go:191] Add success.
W0320 10:53:13.410019 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:53:13.410040 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:53:13.410044 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:53:13.419717 543705 net.go:648] Add success.
I0320 10:53:13.422716 543705 net.go:770] primary dev: ETH0
I0320 10:53:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:53:13.422744 543705 net.go:698] Add success.
I0320 10:53:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:53:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:53:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 10:53:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:53:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 10:53:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:53:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:53:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:53:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:53:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:53:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:53:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:23.409777 543705 memory.go:184] no items to output this cycle
I0320 10:53:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 10:53:23.933670 543705 disk_info.go:125] begin check local disk info of client
I0320 10:53:23.936251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:53:23.936256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abdc0 0xc0001abe00]
E0320 10:53:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:33.409805 543705 memory.go:184] no items to output this cycle
I0320 10:53:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 10:53:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:43.409796 543705 memory.go:191] Add success.
I0320 10:53:43.409811 543705 cpu.go:282] Add success.
I0320 10:53:43.419946 543705 net.go:648] Add success.
I0320 10:53:43.422925 543705 net.go:770] primary dev: ETH0
I0320 10:53:43.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:53:43.422963 543705 net.go:698] Add success.
I0320 10:53:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:53:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:53:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:53:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:53:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:53:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 10:54:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:03.409803 543705 memory.go:184] no items to output this cycle
I0320 10:54:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 10:54:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:13.409783 543705 memory.go:191] Add success.
I0320 10:54:13.409804 543705 cpu.go:282] Add success.
W0320 10:54:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:54:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:54:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:54:13.420245 543705 net.go:648] Add success.
I0320 10:54:13.423360 543705 net.go:770] primary dev: ETH0
I0320 10:54:13.423386 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:54:13.423400 543705 net.go:698] Add success.
I0320 10:54:13.508570 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d54d6228-74a9-4956-8d84-cf55a17173c3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:54:13.508610 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 10:54:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:54:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:54:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 10:54:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:54:14.456678 543705 disk_worker.go:494] system disk:vda1
I0320 10:54:14.456707 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:54:15.455608 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:54:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:54:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:54:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:54:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:23.409780 543705 memory.go:184] no items to output this cycle
I0320 10:54:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 10:54:23.937671 543705 disk_info.go:125] begin check local disk info of client
I0320 10:54:23.940175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:54:23.940181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abc40 0xc0001abc80]
E0320 10:54:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:33.409806 543705 memory.go:184] no items to output this cycle
I0320 10:54:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 10:54:38.438935 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:54:38.438942 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:54:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:43.410724 543705 memory.go:191] Add success.
I0320 10:54:43.409796 543705 cpu.go:282] Add success.
I0320 10:54:43.420430 543705 net.go:648] Add success.
I0320 10:54:43.423111 543705 net.go:770] primary dev: ETH0
I0320 10:54:43.423127 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:54:43.423142 543705 net.go:698] Add success.
I0320 10:54:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:54:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:54:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:54:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:54:53.409778 543705 memory.go:184] no items to output this cycle
I0320 10:54:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 10:55:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:03.409792 543705 memory.go:184] no items to output this cycle
I0320 10:55:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 10:55:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:13.409791 543705 memory.go:191] Add success.
I0320 10:55:13.409796 543705 cpu.go:282] Add success.
W0320 10:55:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:55:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:55:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:55:13.420047 543705 net.go:648] Add success.
I0320 10:55:13.423016 543705 net.go:770] primary dev: ETH0
I0320 10:55:13.423030 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:55:13.423041 543705 net.go:698] Add success.
I0320 10:55:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:55:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:55:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 10:55:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:55:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 10:55:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:55:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:55:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:55:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:55:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:55:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:55:23.410342 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:23.410361 543705 memory.go:184] no items to output this cycle
I0320 10:55:23.410378 543705 cpu.go:275] no items to output this cycle
I0320 10:55:23.941671 543705 disk_info.go:125] begin check local disk info of client
I0320 10:55:23.944188 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:55:23.944195 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb500 0xc0001fb540]
E0320 10:55:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:33.409773 543705 memory.go:184] no items to output this cycle
I0320 10:55:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 10:55:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:43.409793 543705 memory.go:191] Add success.
I0320 10:55:43.409817 543705 cpu.go:282] Add success.
I0320 10:55:43.419917 543705 net.go:648] Add success.
I0320 10:55:43.422429 543705 net.go:770] primary dev: ETH0
I0320 10:55:43.422448 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:55:43.422463 543705 net.go:698] Add success.
I0320 10:55:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:55:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:55:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:55:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:55:53.409778 543705 memory.go:184] no items to output this cycle
I0320 10:55:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 10:56:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:03.409796 543705 memory.go:184] no items to output this cycle
I0320 10:56:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 10:56:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:13.409779 543705 memory.go:191] Add success.
W0320 10:56:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:56:13.409805 543705 cpu.go:282] Add success.
W0320 10:56:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:56:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:56:13.420505 543705 net.go:648] Add success.
I0320 10:56:13.423560 543705 net.go:770] primary dev: ETH0
I0320 10:56:13.423574 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:56:13.423586 543705 net.go:698] Add success.
I0320 10:56:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:56:14.455141 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:56:14.455150 543705 disk_worker.go:708] disk space is not compliant
W0320 10:56:14.455153 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:56:14.456481 543705 disk_worker.go:494] system disk:vda1
I0320 10:56:14.456520 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:56:15.456006 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:56:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:56:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:56:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:56:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:56:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:23.409812 543705 memory.go:184] no items to output this cycle
I0320 10:56:23.409822 543705 cpu.go:275] no items to output this cycle
I0320 10:56:23.945669 543705 disk_info.go:125] begin check local disk info of client
I0320 10:56:23.948196 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:56:23.948201 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329c40 0xc000329c80]
E0320 10:56:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:33.409806 543705 memory.go:184] no items to output this cycle
I0320 10:56:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 10:56:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:43.409823 543705 memory.go:191] Add success.
I0320 10:56:43.409834 543705 cpu.go:282] Add success.
I0320 10:56:43.420014 543705 net.go:648] Add success.
I0320 10:56:43.422826 543705 net.go:770] primary dev: ETH0
I0320 10:56:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:56:43.422856 543705 net.go:698] Add success.
I0320 10:56:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:56:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:56:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:56:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:56:53.409769 543705 memory.go:184] no items to output this cycle
I0320 10:56:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 10:57:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:03.409777 543705 memory.go:184] no items to output this cycle
I0320 10:57:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 10:57:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:13.409810 543705 memory.go:191] Add success.
I0320 10:57:13.409818 543705 cpu.go:282] Add success.
W0320 10:57:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:57:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:57:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:57:13.420132 543705 net.go:648] Add success.
I0320 10:57:13.423017 543705 net.go:770] primary dev: ETH0
I0320 10:57:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:57:13.423043 543705 net.go:698] Add success.
I0320 10:57:13.429345 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 10:57:13.453527 543705 event_worker.go:152] Polling the log file for events...
I0320 10:57:13.468376 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dd329215-d506-4bd5-b97d-3ddf29e4c898","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 10:57:13.468415 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 10:57:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:57:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 10:57:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 10:57:14.456161 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 10:57:14.456171 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 10:57:14.456177 543705 custom_config.go:64] query custom config with name: gpu
I0320 10:57:14.456456 543705 disk_worker.go:494] system disk:vda1
I0320 10:57:14.456653 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 10:57:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 10:57:15.456842 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 10:57:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 10:57:16.457950 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:57:16.458004 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:57:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:57:16.472348 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:57:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:23.409815 543705 memory.go:184] no items to output this cycle
I0320 10:57:23.409830 543705 cpu.go:275] no items to output this cycle
I0320 10:57:23.949677 543705 disk_info.go:125] begin check local disk info of client
I0320 10:57:23.952242 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:57:23.952248 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4e80 0xc0000c4ec0]
E0320 10:57:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:33.409785 543705 memory.go:184] no items to output this cycle
I0320 10:57:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 10:57:38.439989 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 10:57:38.439997 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 10:57:43.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:43.410819 543705 memory.go:191] Add success.
I0320 10:57:43.409842 543705 cpu.go:282] Add success.
I0320 10:57:43.420477 543705 net.go:648] Add success.
I0320 10:57:43.423081 543705 net.go:770] primary dev: ETH0
I0320 10:57:43.423093 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:57:43.423105 543705 net.go:698] Add success.
I0320 10:57:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:57:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:57:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:57:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:57:53.409791 543705 cpu.go:275] no items to output this cycle
I0320 10:57:53.409794 543705 memory.go:184] no items to output this cycle
E0320 10:58:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:03.409800 543705 cpu.go:275] no items to output this cycle
I0320 10:58:03.409809 543705 memory.go:184] no items to output this cycle
E0320 10:58:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:13.409817 543705 memory.go:191] Add success.
I0320 10:58:13.409827 543705 cpu.go:282] Add success.
W0320 10:58:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 10:58:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:58:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:58:13.420125 543705 net.go:648] Add success.
I0320 10:58:13.422952 543705 net.go:770] primary dev: ETH0
I0320 10:58:13.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:58:13.422977 543705 net.go:698] Add success.
I0320 10:58:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:58:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:58:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 10:58:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:58:14.456564 543705 disk_worker.go:494] system disk:vda1
I0320 10:58:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:58:15.456012 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:58:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:58:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:58:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:58:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:58:23.410249 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:23.410269 543705 memory.go:184] no items to output this cycle
I0320 10:58:23.410285 543705 cpu.go:275] no items to output this cycle
I0320 10:58:23.953672 543705 disk_info.go:125] begin check local disk info of client
I0320 10:58:23.956280 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:58:23.956286 543705 disk_info.go:196] parse disk info done, disk is : [0xc000490a40 0xc000490a80]
E0320 10:58:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:33.409793 543705 memory.go:184] no items to output this cycle
I0320 10:58:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 10:58:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:43.409802 543705 memory.go:191] Add success.
I0320 10:58:43.409802 543705 cpu.go:282] Add success.
I0320 10:58:43.419894 543705 net.go:648] Add success.
I0320 10:58:43.422699 543705 net.go:770] primary dev: ETH0
I0320 10:58:43.422711 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:58:43.422723 543705 net.go:698] Add success.
I0320 10:58:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:58:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:58:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:58:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:58:53.409779 543705 memory.go:184] no items to output this cycle
I0320 10:58:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 10:59:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:03.409804 543705 memory.go:184] no items to output this cycle
I0320 10:59:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 10:59:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:13.409775 543705 memory.go:191] Add success.
W0320 10:59:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 10:59:13.409819 543705 cpu.go:282] Add success.
W0320 10:59:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 10:59:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 10:59:13.420076 543705 net.go:648] Add success.
I0320 10:59:13.422741 543705 net.go:770] primary dev: ETH0
I0320 10:59:13.422753 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:59:13.422767 543705 net.go:698] Add success.
I0320 10:59:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 10:59:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 10:59:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 10:59:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 10:59:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 10:59:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 10:59:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 10:59:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:59:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:59:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 10:59:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 10:59:23.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:23.409884 543705 memory.go:184] no items to output this cycle
I0320 10:59:23.409956 543705 cpu.go:275] no items to output this cycle
I0320 10:59:23.957691 543705 disk_info.go:125] begin check local disk info of client
I0320 10:59:23.960103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 10:59:23.960109 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034af00 0xc00034af40]
E0320 10:59:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:33.409779 543705 memory.go:184] no items to output this cycle
I0320 10:59:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 10:59:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:43.409823 543705 memory.go:191] Add success.
I0320 10:59:43.409825 543705 cpu.go:282] Add success.
I0320 10:59:43.419994 543705 net.go:648] Add success.
I0320 10:59:43.422824 543705 net.go:770] primary dev: ETH0
I0320 10:59:43.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0320 10:59:43.422858 543705 net.go:698] Add success.
I0320 10:59:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 10:59:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 10:59:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 10:59:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 10:59:53.409779 543705 memory.go:184] no items to output this cycle
I0320 10:59:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:00:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:03.409771 543705 memory.go:184] no items to output this cycle
I0320 11:00:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 11:00:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:13.409793 543705 cpu.go:282] Add success.
I0320 11:00:13.409801 543705 memory.go:191] Add success.
W0320 11:00:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:00:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:00:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:00:13.419834 543705 net.go:770] primary dev: ETH0
I0320 11:00:13.419846 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:00:13.419858 543705 net.go:698] Add success.
I0320 11:00:13.420208 543705 net.go:648] Add success.
I0320 11:00:13.477260 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bda8dfe1-ce3b-4e25-ad68-dfd01a1f5b74","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:00:13.477295 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:00:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:00:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:00:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 11:00:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:00:14.456470 543705 disk_worker.go:494] system disk:vda1
I0320 11:00:14.456514 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:00:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:00:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:00:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:00:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:00:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:23.409774 543705 memory.go:184] no items to output this cycle
I0320 11:00:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 11:00:23.960201 543705 disk_info.go:125] begin check local disk info of client
I0320 11:00:23.962707 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:00:23.962712 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e0900 0xc0004e0940]
E0320 11:00:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:33.409777 543705 memory.go:184] no items to output this cycle
I0320 11:00:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 11:00:38.440978 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:00:38.440985 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:00:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:43.410662 543705 memory.go:191] Add success.
I0320 11:00:43.409827 543705 cpu.go:282] Add success.
I0320 11:00:43.420366 543705 net.go:648] Add success.
I0320 11:00:43.423420 543705 net.go:770] primary dev: ETH0
I0320 11:00:43.423435 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:00:43.423450 543705 net.go:698] Add success.
I0320 11:00:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:00:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:00:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:00:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:00:53.409793 543705 memory.go:184] no items to output this cycle
I0320 11:00:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 11:01:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:03.409766 543705 memory.go:184] no items to output this cycle
I0320 11:01:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:01:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:13.409806 543705 memory.go:191] Add success.
I0320 11:01:13.409818 543705 cpu.go:282] Add success.
W0320 11:01:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:01:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:01:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:01:13.420079 543705 net.go:648] Add success.
I0320 11:01:13.423154 543705 net.go:770] primary dev: ETH0
I0320 11:01:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:01:13.423184 543705 net.go:698] Add success.
I0320 11:01:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:01:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:01:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 11:01:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:01:14.456569 543705 disk_worker.go:494] system disk:vda1
I0320 11:01:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:01:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:01:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:01:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:01:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:01:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:23.409803 543705 memory.go:184] no items to output this cycle
I0320 11:01:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 11:01:23.964939 543705 disk_info.go:125] begin check local disk info of client
I0320 11:01:23.967408 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:01:23.967415 543705 disk_info.go:196] parse disk info done, disk is : [0xc000490000 0xc000490040]
E0320 11:01:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:33.409768 543705 memory.go:184] no items to output this cycle
I0320 11:01:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 11:01:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:43.409831 543705 memory.go:191] Add success.
I0320 11:01:43.409836 543705 cpu.go:282] Add success.
I0320 11:01:43.420085 543705 net.go:648] Add success.
I0320 11:01:43.423051 543705 net.go:770] primary dev: ETH0
I0320 11:01:43.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:01:43.423076 543705 net.go:698] Add success.
I0320 11:01:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:01:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:01:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:01:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:01:53.409780 543705 memory.go:184] no items to output this cycle
I0320 11:01:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:02:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:03.409774 543705 memory.go:184] no items to output this cycle
I0320 11:02:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 11:02:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:13.409775 543705 memory.go:191] Add success.
W0320 11:02:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:02:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:02:13.409811 543705 cpu.go:282] Add success.
I0320 11:02:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:02:13.420080 543705 net.go:648] Add success.
I0320 11:02:13.422873 543705 net.go:770] primary dev: ETH0
I0320 11:02:13.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:02:13.422897 543705 net.go:698] Add success.
W0320 11:02:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:02:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 11:02:14.455202 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:02:14.455912 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:02:14.455921 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:02:14.455928 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:02:14.456564 543705 disk_worker.go:494] system disk:vda1
I0320 11:02:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:02:15.456860 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:02:15.456869 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:02:16.457928 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:02:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:02:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:02:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:02:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:02:23.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:23.409819 543705 memory.go:184] no items to output this cycle
I0320 11:02:23.409822 543705 cpu.go:275] no items to output this cycle
I0320 11:02:23.969671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:02:23.972170 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:02:23.972175 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003406c0 0xc000340700]
E0320 11:02:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:33.409769 543705 memory.go:184] no items to output this cycle
I0320 11:02:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:02:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:43.409793 543705 memory.go:191] Add success.
I0320 11:02:43.409799 543705 cpu.go:282] Add success.
I0320 11:02:43.420107 543705 net.go:648] Add success.
I0320 11:02:43.422868 543705 net.go:770] primary dev: ETH0
I0320 11:02:43.422881 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:02:43.422892 543705 net.go:698] Add success.
I0320 11:02:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:02:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:02:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:02:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:02:53.409772 543705 memory.go:184] no items to output this cycle
I0320 11:02:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 11:03:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:03.409776 543705 memory.go:184] no items to output this cycle
I0320 11:03:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 11:03:13.410393 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:13.410426 543705 memory.go:191] Add success.
I0320 11:03:13.410434 543705 cpu.go:282] Add success.
W0320 11:03:13.410458 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:03:13.410474 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:03:13.410478 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:03:13.420707 543705 net.go:648] Add success.
I0320 11:03:13.423651 543705 net.go:770] primary dev: ETH0
I0320 11:03:13.423665 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:03:13.423676 543705 net.go:698] Add success.
I0320 11:03:13.469297 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"56c1b8fc-5886-4b40-bfbb-8bb470ad132d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:03:13.469332 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:03:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:03:14.455216 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:03:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 11:03:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:03:14.456776 543705 disk_worker.go:494] system disk:vda1
I0320 11:03:14.456805 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:03:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:03:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:03:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:03:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:03:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:03:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:23.409785 543705 memory.go:184] no items to output this cycle
I0320 11:03:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 11:03:23.973674 543705 disk_info.go:125] begin check local disk info of client
I0320 11:03:23.976199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:03:23.976204 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340080 0xc0003400c0]
E0320 11:03:33.410575 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:33.410592 543705 memory.go:184] no items to output this cycle
I0320 11:03:33.410632 543705 cpu.go:275] no items to output this cycle
I0320 11:03:38.441966 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:03:38.441974 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:03:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:43.410666 543705 memory.go:191] Add success.
I0320 11:03:43.409810 543705 cpu.go:282] Add success.
I0320 11:03:43.420191 543705 net.go:770] primary dev: ETH0
I0320 11:03:43.420206 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:03:43.420219 543705 net.go:698] Add success.
I0320 11:03:43.420760 543705 net.go:648] Add success.
I0320 11:03:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:03:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:03:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:03:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:03:53.409802 543705 memory.go:184] no items to output this cycle
I0320 11:03:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 11:04:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:03.409778 543705 memory.go:184] no items to output this cycle
I0320 11:04:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 11:04:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:13.409782 543705 memory.go:191] Add success.
W0320 11:04:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:04:13.409808 543705 cpu.go:282] Add success.
W0320 11:04:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:04:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:04:13.420046 543705 net.go:648] Add success.
I0320 11:04:13.422589 543705 net.go:770] primary dev: ETH0
I0320 11:04:13.422601 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:04:13.422613 543705 net.go:698] Add success.
I0320 11:04:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:04:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:04:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 11:04:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:04:14.456504 543705 disk_worker.go:494] system disk:vda1
I0320 11:04:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:04:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:04:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:04:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:04:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:04:16.472478 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:04:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:23.409777 543705 memory.go:184] no items to output this cycle
I0320 11:04:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 11:04:23.977670 543705 disk_info.go:125] begin check local disk info of client
I0320 11:04:23.980202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:04:23.980207 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039ac00 0xc00039ac40]
E0320 11:04:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:33.409772 543705 memory.go:184] no items to output this cycle
I0320 11:04:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 11:04:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:43.409809 543705 memory.go:191] Add success.
I0320 11:04:43.409820 543705 cpu.go:282] Add success.
I0320 11:04:43.419880 543705 net.go:648] Add success.
I0320 11:04:43.422623 543705 net.go:770] primary dev: ETH0
I0320 11:04:43.422636 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:04:43.422650 543705 net.go:698] Add success.
I0320 11:04:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:04:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:04:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:04:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:04:53.409767 543705 memory.go:184] no items to output this cycle
I0320 11:04:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:05:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:03.409980 543705 memory.go:184] no items to output this cycle
I0320 11:05:03.410012 543705 cpu.go:275] no items to output this cycle
E0320 11:05:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:13.409811 543705 memory.go:191] Add success.
I0320 11:05:13.409817 543705 cpu.go:282] Add success.
W0320 11:05:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:05:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:05:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:05:13.420155 543705 net.go:648] Add success.
I0320 11:05:13.422659 543705 net.go:770] primary dev: ETH0
I0320 11:05:13.422672 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:05:13.422684 543705 net.go:698] Add success.
I0320 11:05:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:05:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:05:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 11:05:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:05:14.456492 543705 disk_worker.go:494] system disk:vda1
I0320 11:05:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:05:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:05:16.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:05:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:05:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:05:16.472420 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:05:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:23.409806 543705 memory.go:184] no items to output this cycle
I0320 11:05:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 11:05:23.981669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:05:23.984231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:05:23.984236 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471e00 0xc000471e40]
E0320 11:05:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:33.409799 543705 memory.go:184] no items to output this cycle
I0320 11:05:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 11:05:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:43.409792 543705 memory.go:191] Add success.
I0320 11:05:43.409812 543705 cpu.go:282] Add success.
I0320 11:05:43.420010 543705 net.go:648] Add success.
I0320 11:05:43.423138 543705 net.go:770] primary dev: ETH0
I0320 11:05:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:05:43.423163 543705 net.go:698] Add success.
I0320 11:05:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:05:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:05:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:05:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:05:53.409797 543705 memory.go:184] no items to output this cycle
I0320 11:05:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 11:06:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:03.409780 543705 memory.go:184] no items to output this cycle
I0320 11:06:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 11:06:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:13.409787 543705 memory.go:191] Add success.
I0320 11:06:13.409787 543705 cpu.go:282] Add success.
W0320 11:06:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:06:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:06:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:06:13.419739 543705 net.go:648] Add success.
I0320 11:06:13.422893 543705 net.go:770] primary dev: ETH0
I0320 11:06:13.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:06:13.422917 543705 net.go:698] Add success.
I0320 11:06:13.468761 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b3c055d4-54a8-4948-bc86-de93ae891588","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:06:13.468792 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:06:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:06:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:06:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 11:06:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:06:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 11:06:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:06:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:06:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:06:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:06:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:06:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:23.409776 543705 memory.go:184] no items to output this cycle
I0320 11:06:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 11:06:23.985672 543705 disk_info.go:125] begin check local disk info of client
I0320 11:06:23.988205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:06:23.988211 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470540 0xc000470580]
E0320 11:06:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:33.409775 543705 memory.go:184] no items to output this cycle
I0320 11:06:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 11:06:38.442971 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:06:38.442978 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:06:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:43.410592 543705 memory.go:191] Add success.
I0320 11:06:43.409802 543705 cpu.go:282] Add success.
I0320 11:06:43.420299 543705 net.go:648] Add success.
I0320 11:06:43.422769 543705 net.go:770] primary dev: ETH0
I0320 11:06:43.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:06:43.422796 543705 net.go:698] Add success.
I0320 11:06:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:06:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:06:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:06:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:06:53.409782 543705 memory.go:184] no items to output this cycle
I0320 11:06:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 11:07:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:03.409780 543705 memory.go:184] no items to output this cycle
I0320 11:07:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 11:07:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:13.409785 543705 memory.go:191] Add success.
I0320 11:07:13.409796 543705 cpu.go:282] Add success.
W0320 11:07:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:07:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:07:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:07:13.419807 543705 net.go:648] Add success.
I0320 11:07:13.422824 543705 net.go:770] primary dev: ETH0
I0320 11:07:13.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:07:13.422849 543705 net.go:698] Add success.
I0320 11:07:13.453535 543705 event_worker.go:152] Polling the log file for events...
W0320 11:07:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:07:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:07:14.455163 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:07:14.456965 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:07:14.456974 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:07:14.456980 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:07:14.457018 543705 disk_worker.go:494] system disk:vda1
I0320 11:07:14.457048 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:07:15.456808 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:07:15.456816 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:07:16.458022 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:07:16.458021 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:07:16.458087 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:07:16.458111 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:07:16.472518 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:07:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:23.409787 543705 memory.go:184] no items to output this cycle
I0320 11:07:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 11:07:23.989671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:07:23.992245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:07:23.992251 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4740 0xc0000c4780]
E0320 11:07:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:33.409775 543705 memory.go:184] no items to output this cycle
I0320 11:07:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 11:07:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:43.409801 543705 memory.go:191] Add success.
I0320 11:07:43.409802 543705 cpu.go:282] Add success.
I0320 11:07:43.419894 543705 net.go:648] Add success.
I0320 11:07:43.422868 543705 net.go:770] primary dev: ETH0
I0320 11:07:43.422881 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:07:43.422894 543705 net.go:698] Add success.
I0320 11:07:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:07:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:07:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:07:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:07:53.409798 543705 memory.go:184] no items to output this cycle
I0320 11:07:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 11:08:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:03.409777 543705 memory.go:184] no items to output this cycle
I0320 11:08:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:08:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:13.409815 543705 memory.go:191] Add success.
I0320 11:08:13.409819 543705 cpu.go:282] Add success.
W0320 11:08:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:08:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:08:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:08:13.420250 543705 net.go:648] Add success.
I0320 11:08:13.423055 543705 net.go:770] primary dev: ETH0
I0320 11:08:13.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:08:13.423079 543705 net.go:698] Add success.
I0320 11:08:14.454008 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:08:14.454227 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:08:14.454237 543705 disk_worker.go:708] disk space is not compliant
W0320 11:08:14.454240 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:08:14.455619 543705 disk_worker.go:494] system disk:vda1
I0320 11:08:14.455647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:08:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:08:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:08:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:08:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:08:16.472424 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:08:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:23.409823 543705 memory.go:184] no items to output this cycle
I0320 11:08:23.409826 543705 cpu.go:275] no items to output this cycle
I0320 11:08:23.993669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:08:23.996229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:08:23.996234 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047b100 0xc00047b140]
E0320 11:08:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:33.409778 543705 memory.go:184] no items to output this cycle
I0320 11:08:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 11:08:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:43.409799 543705 memory.go:191] Add success.
I0320 11:08:43.409802 543705 cpu.go:282] Add success.
I0320 11:08:43.419952 543705 net.go:648] Add success.
I0320 11:08:43.423047 543705 net.go:770] primary dev: ETH0
I0320 11:08:43.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:08:43.423073 543705 net.go:698] Add success.
I0320 11:08:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:08:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:08:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:08:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:08:53.409775 543705 memory.go:184] no items to output this cycle
I0320 11:08:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:09:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:03.409761 543705 memory.go:184] no items to output this cycle
I0320 11:09:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 11:09:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:13.409809 543705 memory.go:191] Add success.
I0320 11:09:13.409821 543705 cpu.go:282] Add success.
W0320 11:09:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:09:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:09:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:09:13.420067 543705 net.go:648] Add success.
I0320 11:09:13.423511 543705 net.go:770] primary dev: ETH0
I0320 11:09:13.423597 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:09:13.423619 543705 net.go:698] Add success.
I0320 11:09:13.470178 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"32ef6a57-4b76-4444-9507-0dd29c5e3ac2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:09:13.470208 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:09:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:09:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:09:14.455250 543705 disk_worker.go:708] disk space is not compliant
W0320 11:09:14.455255 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:09:14.457008 543705 disk_worker.go:494] system disk:vda1
I0320 11:09:14.457038 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:09:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:09:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:09:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:09:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:09:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:09:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:23.409808 543705 memory.go:184] no items to output this cycle
I0320 11:09:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 11:09:23.997672 543705 disk_info.go:125] begin check local disk info of client
I0320 11:09:24.000207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:09:24.000212 543705 disk_info.go:196] parse disk info done, disk is : [0xc000547880 0xc0005478c0]
E0320 11:09:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:33.409769 543705 memory.go:184] no items to output this cycle
I0320 11:09:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 11:09:38.443973 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:09:38.443980 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:09:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:43.410737 543705 memory.go:191] Add success.
I0320 11:09:43.409825 543705 cpu.go:282] Add success.
I0320 11:09:43.420452 543705 net.go:648] Add success.
I0320 11:09:43.423431 543705 net.go:770] primary dev: ETH0
I0320 11:09:43.423446 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:09:43.423460 543705 net.go:698] Add success.
I0320 11:09:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:09:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:09:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:09:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:09:53.409786 543705 memory.go:184] no items to output this cycle
I0320 11:09:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 11:10:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:03.409775 543705 memory.go:184] no items to output this cycle
I0320 11:10:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 11:10:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:13.409813 543705 cpu.go:282] Add success.
I0320 11:10:13.409822 543705 memory.go:191] Add success.
W0320 11:10:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:10:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:10:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:10:13.420222 543705 net.go:648] Add success.
I0320 11:10:13.423186 543705 net.go:770] primary dev: ETH0
I0320 11:10:13.423200 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:10:13.423215 543705 net.go:698] Add success.
I0320 11:10:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:10:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:10:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 11:10:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:10:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 11:10:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:10:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:10:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:10:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:10:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:10:16.472433 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:10:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:23.409805 543705 memory.go:184] no items to output this cycle
I0320 11:10:23.409824 543705 cpu.go:275] no items to output this cycle
I0320 11:10:24.001673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:10:24.004191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:10:24.004197 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd040 0xc0002bd080]
E0320 11:10:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:33.409766 543705 memory.go:184] no items to output this cycle
I0320 11:10:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:10:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:43.409813 543705 memory.go:191] Add success.
I0320 11:10:43.409822 543705 cpu.go:282] Add success.
I0320 11:10:43.419994 543705 net.go:648] Add success.
I0320 11:10:43.422989 543705 net.go:770] primary dev: ETH0
I0320 11:10:43.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:10:43.423016 543705 net.go:698] Add success.
I0320 11:10:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:10:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:10:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:10:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:10:53.409777 543705 memory.go:184] no items to output this cycle
I0320 11:10:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 11:11:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:03.409782 543705 memory.go:184] no items to output this cycle
I0320 11:11:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 11:11:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:13.409795 543705 memory.go:191] Add success.
I0320 11:11:13.409811 543705 cpu.go:282] Add success.
W0320 11:11:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:11:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:11:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:11:13.420303 543705 net.go:648] Add success.
I0320 11:11:13.422991 543705 net.go:770] primary dev: ETH0
I0320 11:11:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:11:13.423016 543705 net.go:698] Add success.
I0320 11:11:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:11:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:11:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:11:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:11:14.457621 543705 disk_worker.go:494] system disk:vda1
I0320 11:11:14.457676 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:11:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:11:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:11:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:11:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:11:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:11:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:23.409782 543705 memory.go:184] no items to output this cycle
I0320 11:11:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 11:11:24.005669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:11:24.008184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:11:24.008189 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd440 0xc0002bd480]
E0320 11:11:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:33.409806 543705 memory.go:184] no items to output this cycle
I0320 11:11:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 11:11:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:43.409810 543705 cpu.go:282] Add success.
I0320 11:11:43.409814 543705 memory.go:191] Add success.
I0320 11:11:43.420015 543705 net.go:648] Add success.
I0320 11:11:43.422849 543705 net.go:770] primary dev: ETH0
I0320 11:11:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:11:43.422877 543705 net.go:698] Add success.
I0320 11:11:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:11:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:11:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:11:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:11:53.409794 543705 memory.go:184] no items to output this cycle
I0320 11:11:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:12:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:03.409780 543705 memory.go:184] no items to output this cycle
I0320 11:12:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 11:12:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:13.409800 543705 memory.go:191] Add success.
I0320 11:12:13.409816 543705 cpu.go:282] Add success.
W0320 11:12:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:12:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:12:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:12:13.420133 543705 net.go:648] Add success.
I0320 11:12:13.422843 543705 net.go:770] primary dev: ETH0
I0320 11:12:13.422856 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:12:13.422869 543705 net.go:698] Add success.
I0320 11:12:13.463721 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f37c6eaf-d07c-4078-a95a-59b479bc8b7b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:12:13.463757 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 11:12:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:12:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:12:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:12:14.456919 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:12:14.456929 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:12:14.456934 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:12:14.457108 543705 disk_worker.go:494] system disk:vda1
I0320 11:12:14.457150 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:12:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:12:15.456865 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:12:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:12:16.457912 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:12:16.457966 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:12:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:12:16.472333 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:12:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:12:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 11:12:24.009673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:12:24.012208 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:12:24.012214 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487cc0 0xc000487d00]
E0320 11:12:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:33.409805 543705 memory.go:184] no items to output this cycle
I0320 11:12:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 11:12:38.444973 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:12:38.444980 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:12:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:43.410750 543705 memory.go:191] Add success.
I0320 11:12:43.409823 543705 cpu.go:282] Add success.
I0320 11:12:43.420425 543705 net.go:648] Add success.
I0320 11:12:43.423069 543705 net.go:770] primary dev: ETH0
I0320 11:12:43.423082 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:12:43.423094 543705 net.go:698] Add success.
I0320 11:12:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:12:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:12:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:12:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:12:53.409789 543705 memory.go:184] no items to output this cycle
I0320 11:12:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 11:13:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:03.409815 543705 memory.go:184] no items to output this cycle
I0320 11:13:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 11:13:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:13.409796 543705 memory.go:191] Add success.
W0320 11:13:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:13:13.409823 543705 cpu.go:282] Add success.
W0320 11:13:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:13:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:13:13.420116 543705 net.go:648] Add success.
I0320 11:13:13.423089 543705 net.go:770] primary dev: ETH0
I0320 11:13:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:13:13.423119 543705 net.go:698] Add success.
I0320 11:13:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:13:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:13:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 11:13:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:13:14.456571 543705 disk_worker.go:494] system disk:vda1
I0320 11:13:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:13:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:13:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:13:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:13:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:13:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:13:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:23.409790 543705 memory.go:184] no items to output this cycle
I0320 11:13:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 11:13:24.013672 543705 disk_info.go:125] begin check local disk info of client
I0320 11:13:24.016197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:13:24.016204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d5cc0 0xc0003d5d00]
E0320 11:13:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:33.409812 543705 memory.go:184] no items to output this cycle
I0320 11:13:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 11:13:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:43.409793 543705 memory.go:191] Add success.
I0320 11:13:43.409820 543705 cpu.go:282] Add success.
I0320 11:13:43.419911 543705 net.go:648] Add success.
I0320 11:13:43.422629 543705 net.go:770] primary dev: ETH0
I0320 11:13:43.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:13:43.422660 543705 net.go:698] Add success.
I0320 11:13:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:13:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:13:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:13:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:13:53.409779 543705 memory.go:184] no items to output this cycle
I0320 11:13:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:14:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:03.409767 543705 memory.go:184] no items to output this cycle
I0320 11:14:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:14:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:13.409818 543705 memory.go:191] Add success.
I0320 11:14:13.409825 543705 cpu.go:282] Add success.
W0320 11:14:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:14:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:14:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:14:13.420103 543705 net.go:648] Add success.
I0320 11:14:13.423162 543705 net.go:770] primary dev: ETH0
I0320 11:14:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:14:13.423188 543705 net.go:698] Add success.
I0320 11:14:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:14:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:14:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:14:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:14:14.456476 543705 disk_worker.go:494] system disk:vda1
I0320 11:14:14.456521 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:14:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:14:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:14:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:14:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:14:23.409904 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:23.409912 543705 cpu.go:275] no items to output this cycle
I0320 11:14:23.409959 543705 memory.go:184] no items to output this cycle
I0320 11:14:24.017669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:14:24.020201 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:14:24.020207 543705 disk_info.go:196] parse disk info done, disk is : [0xc000254140 0xc000254180]
E0320 11:14:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:33.409775 543705 memory.go:184] no items to output this cycle
I0320 11:14:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 11:14:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:43.409784 543705 memory.go:191] Add success.
I0320 11:14:43.409803 543705 cpu.go:282] Add success.
I0320 11:14:43.419987 543705 net.go:648] Add success.
I0320 11:14:43.422811 543705 net.go:770] primary dev: ETH0
I0320 11:14:43.422823 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:14:43.422835 543705 net.go:698] Add success.
I0320 11:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:14:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:14:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:14:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:14:53.409783 543705 memory.go:184] no items to output this cycle
I0320 11:14:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 11:15:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:03.409770 543705 memory.go:184] no items to output this cycle
I0320 11:15:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 11:15:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:13.409791 543705 memory.go:191] Add success.
I0320 11:15:13.409814 543705 cpu.go:282] Add success.
W0320 11:15:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:15:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:15:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:15:13.420167 543705 net.go:648] Add success.
I0320 11:15:13.422900 543705 net.go:770] primary dev: ETH0
I0320 11:15:13.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:15:13.422925 543705 net.go:698] Add success.
I0320 11:15:13.469482 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3b7d8aeb-2b6e-45e9-bba8-d6c7b9e3991d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:15:13.469515 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:15:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:15:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:15:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 11:15:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:15:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 11:15:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:15:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:15:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:15:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:15:16.458168 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:15:16.472120 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:15:23.410395 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:23.410413 543705 memory.go:184] no items to output this cycle
I0320 11:15:23.410420 543705 cpu.go:275] no items to output this cycle
I0320 11:15:24.021671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:15:24.024273 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:15:24.024279 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462fc0 0xc000463000]
E0320 11:15:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:33.409782 543705 memory.go:184] no items to output this cycle
I0320 11:15:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 11:15:38.446003 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:15:38.446010 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:15:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:43.410704 543705 memory.go:191] Add success.
I0320 11:15:43.409823 543705 cpu.go:282] Add success.
I0320 11:15:43.420393 543705 net.go:648] Add success.
I0320 11:15:43.423331 543705 net.go:770] primary dev: ETH0
I0320 11:15:43.423344 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:15:43.423356 543705 net.go:698] Add success.
I0320 11:15:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:15:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:15:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:15:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:15:53.409787 543705 memory.go:184] no items to output this cycle
I0320 11:15:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:16:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:03.409771 543705 memory.go:184] no items to output this cycle
I0320 11:16:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 11:16:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:13.409813 543705 memory.go:191] Add success.
I0320 11:16:13.409825 543705 cpu.go:282] Add success.
W0320 11:16:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:16:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:16:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:16:13.420435 543705 net.go:648] Add success.
I0320 11:16:13.423324 543705 net.go:770] primary dev: ETH0
I0320 11:16:13.423336 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:16:13.423351 543705 net.go:698] Add success.
I0320 11:16:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:16:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:16:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 11:16:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:16:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 11:16:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:16:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:16:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:16:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:16:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:16:16.472432 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:16:23.410406 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:23.410425 543705 memory.go:184] no items to output this cycle
I0320 11:16:23.410442 543705 cpu.go:275] no items to output this cycle
I0320 11:16:24.025674 543705 disk_info.go:125] begin check local disk info of client
I0320 11:16:24.028224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:16:24.028230 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 11:16:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:33.409784 543705 memory.go:184] no items to output this cycle
I0320 11:16:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 11:16:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:43.409817 543705 memory.go:191] Add success.
I0320 11:16:43.409819 543705 cpu.go:282] Add success.
I0320 11:16:43.419987 543705 net.go:648] Add success.
I0320 11:16:43.423088 543705 net.go:770] primary dev: ETH0
I0320 11:16:43.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:16:43.423117 543705 net.go:698] Add success.
I0320 11:16:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:16:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:16:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:16:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:16:53.409799 543705 memory.go:184] no items to output this cycle
I0320 11:16:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 11:17:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:03.409773 543705 memory.go:184] no items to output this cycle
I0320 11:17:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 11:17:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:13.409813 543705 memory.go:191] Add success.
I0320 11:17:13.409817 543705 cpu.go:282] Add success.
W0320 11:17:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:17:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:17:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:17:13.420250 543705 net.go:648] Add success.
I0320 11:17:13.422888 543705 net.go:770] primary dev: ETH0
I0320 11:17:13.422901 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:17:13.422914 543705 net.go:698] Add success.
I0320 11:17:13.453452 543705 event_worker.go:152] Polling the log file for events...
W0320 11:17:14.455464 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:17:14.455484 543705 disk_worker.go:708] disk space is not compliant
W0320 11:17:14.455489 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:17:14.456548 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:17:14.456557 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:17:14.456564 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:17:14.457495 543705 disk_worker.go:494] system disk:vda1
I0320 11:17:14.457533 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:17:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:17:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:17:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:17:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:17:16.457980 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:17:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:17:16.472329 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:17:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 11:17:23.409793 543705 memory.go:184] no items to output this cycle
I0320 11:17:24.029674 543705 disk_info.go:125] begin check local disk info of client
I0320 11:17:24.032216 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:17:24.032222 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa380 0xc0001fa3c0]
E0320 11:17:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:33.409795 543705 memory.go:184] no items to output this cycle
I0320 11:17:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:17:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:43.409800 543705 memory.go:191] Add success.
I0320 11:17:43.409805 543705 cpu.go:282] Add success.
I0320 11:17:43.419976 543705 net.go:648] Add success.
I0320 11:17:43.422740 543705 net.go:770] primary dev: ETH0
I0320 11:17:43.422753 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:17:43.422767 543705 net.go:698] Add success.
I0320 11:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:17:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:17:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:17:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:17:53.409776 543705 memory.go:184] no items to output this cycle
I0320 11:17:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 11:18:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:03.409776 543705 memory.go:184] no items to output this cycle
I0320 11:18:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 11:18:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:13.409812 543705 memory.go:191] Add success.
I0320 11:18:13.409818 543705 cpu.go:282] Add success.
W0320 11:18:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:18:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:18:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:18:13.420099 543705 net.go:648] Add success.
I0320 11:18:13.422906 543705 net.go:770] primary dev: ETH0
I0320 11:18:13.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:18:13.422932 543705 net.go:698] Add success.
I0320 11:18:13.463611 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"553f1a5f-c339-494d-8ff6-510754dc6120","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:18:13.463641 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:18:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:18:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:18:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 11:18:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:18:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 11:18:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:18:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:18:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:18:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:18:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:18:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:18:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 11:18:23.409792 543705 memory.go:184] no items to output this cycle
I0320 11:18:24.033673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:18:24.036203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:18:24.036208 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046ec00 0xc00046ec40]
E0320 11:18:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:33.409770 543705 memory.go:184] no items to output this cycle
I0320 11:18:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 11:18:38.446982 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:18:38.446989 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:18:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:43.410851 543705 memory.go:191] Add success.
I0320 11:18:43.409799 543705 cpu.go:282] Add success.
I0320 11:18:43.420539 543705 net.go:648] Add success.
I0320 11:18:43.423605 543705 net.go:770] primary dev: ETH0
I0320 11:18:43.423618 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:18:43.423631 543705 net.go:698] Add success.
I0320 11:18:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:18:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:18:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:18:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:18:53.409783 543705 memory.go:184] no items to output this cycle
I0320 11:18:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 11:19:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:03.409766 543705 memory.go:184] no items to output this cycle
I0320 11:19:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 11:19:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:13.409827 543705 memory.go:191] Add success.
I0320 11:19:13.409840 543705 cpu.go:282] Add success.
W0320 11:19:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:19:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:19:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:19:13.420173 543705 net.go:648] Add success.
I0320 11:19:13.422842 543705 net.go:770] primary dev: ETH0
I0320 11:19:13.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:19:13.422868 543705 net.go:698] Add success.
I0320 11:19:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:19:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:19:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 11:19:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:19:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 11:19:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:19:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:19:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:19:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:19:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:19:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:19:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:23.409787 543705 memory.go:184] no items to output this cycle
I0320 11:19:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 11:19:24.037672 543705 disk_info.go:125] begin check local disk info of client
I0320 11:19:24.040205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:19:24.040210 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f8cc0 0xc0001f8d00]
E0320 11:19:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:33.409772 543705 memory.go:184] no items to output this cycle
I0320 11:19:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 11:19:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:43.409825 543705 memory.go:191] Add success.
I0320 11:19:43.409834 543705 cpu.go:282] Add success.
I0320 11:19:43.420022 543705 net.go:648] Add success.
I0320 11:19:43.422990 543705 net.go:770] primary dev: ETH0
I0320 11:19:43.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:19:43.423018 543705 net.go:698] Add success.
I0320 11:19:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:19:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:19:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:19:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:19:53.409779 543705 memory.go:184] no items to output this cycle
I0320 11:19:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 11:20:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:03.409768 543705 memory.go:184] no items to output this cycle
I0320 11:20:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 11:20:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:13.409822 543705 memory.go:191] Add success.
I0320 11:20:13.409829 543705 cpu.go:282] Add success.
W0320 11:20:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:20:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:20:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:20:13.420158 543705 net.go:648] Add success.
I0320 11:20:13.422806 543705 net.go:770] primary dev: ETH0
I0320 11:20:13.422821 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:20:13.422833 543705 net.go:698] Add success.
I0320 11:20:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:20:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:20:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 11:20:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:20:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 11:20:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:20:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:20:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:20:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:20:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:20:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 11:20:23.409789 543705 memory.go:184] no items to output this cycle
I0320 11:20:24.041672 543705 disk_info.go:125] begin check local disk info of client
I0320 11:20:24.044167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:20:24.044172 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9000 0xc0001f9040]
E0320 11:20:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:33.409765 543705 memory.go:184] no items to output this cycle
I0320 11:20:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:20:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:43.409795 543705 memory.go:191] Add success.
I0320 11:20:43.409800 543705 cpu.go:282] Add success.
I0320 11:20:43.419894 543705 net.go:648] Add success.
I0320 11:20:43.422551 543705 net.go:770] primary dev: ETH0
I0320 11:20:43.422565 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:20:43.422577 543705 net.go:698] Add success.
I0320 11:20:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:20:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:20:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:20:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:20:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 11:20:53.409789 543705 memory.go:184] no items to output this cycle
E0320 11:21:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:03.409772 543705 memory.go:184] no items to output this cycle
I0320 11:21:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 11:21:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:13.409791 543705 memory.go:191] Add success.
I0320 11:21:13.409808 543705 cpu.go:282] Add success.
W0320 11:21:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:21:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:21:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:21:13.420344 543705 net.go:648] Add success.
I0320 11:21:13.423228 543705 net.go:770] primary dev: ETH0
I0320 11:21:13.423240 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:21:13.423252 543705 net.go:698] Add success.
I0320 11:21:13.468648 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5540b5b8-1c03-488c-960f-3791ab041b0f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:21:13.468682 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:21:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:21:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:21:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 11:21:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:21:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 11:21:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:21:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:21:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:21:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:21:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:21:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:21:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:23.409807 543705 memory.go:184] no items to output this cycle
I0320 11:21:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 11:21:24.045670 543705 disk_info.go:125] begin check local disk info of client
I0320 11:21:24.048208 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:21:24.048213 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9a00 0xc0001f9a40]
E0320 11:21:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:33.409770 543705 memory.go:184] no items to output this cycle
I0320 11:21:33.409794 543705 cpu.go:275] no items to output this cycle
I0320 11:21:38.447999 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:21:38.448007 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:21:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:43.410669 543705 memory.go:191] Add success.
I0320 11:21:43.409823 543705 cpu.go:282] Add success.
I0320 11:21:43.420407 543705 net.go:648] Add success.
I0320 11:21:43.423066 543705 net.go:770] primary dev: ETH0
I0320 11:21:43.423080 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:21:43.423094 543705 net.go:698] Add success.
I0320 11:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:21:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:21:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:21:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:21:53.409763 543705 memory.go:184] no items to output this cycle
I0320 11:21:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 11:22:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:03.409779 543705 memory.go:184] no items to output this cycle
I0320 11:22:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:22:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:13.409819 543705 memory.go:191] Add success.
I0320 11:22:13.409823 543705 cpu.go:282] Add success.
W0320 11:22:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:22:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:22:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:22:13.420122 543705 net.go:648] Add success.
I0320 11:22:13.423347 543705 net.go:770] primary dev: ETH0
I0320 11:22:13.423359 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:22:13.423371 543705 net.go:698] Add success.
W0320 11:22:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:22:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0320 11:22:14.455155 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:22:14.456897 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:22:14.456907 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:22:14.456913 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:22:14.456985 543705 disk_worker.go:494] system disk:vda1
I0320 11:22:14.457013 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:22:15.456865 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:22:15.456875 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:22:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:22:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:22:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:22:16.458034 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:22:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:22:23.410430 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:23.410451 543705 memory.go:184] no items to output this cycle
I0320 11:22:23.410458 543705 cpu.go:275] no items to output this cycle
I0320 11:22:24.049669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:22:24.052189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:22:24.052194 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abd40 0xc0001abd80]
E0320 11:22:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:33.409769 543705 memory.go:184] no items to output this cycle
I0320 11:22:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:22:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:43.409815 543705 memory.go:191] Add success.
I0320 11:22:43.409820 543705 cpu.go:282] Add success.
I0320 11:22:43.419953 543705 net.go:648] Add success.
I0320 11:22:43.422745 543705 net.go:770] primary dev: ETH0
I0320 11:22:43.422760 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:22:43.422773 543705 net.go:698] Add success.
I0320 11:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:22:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:22:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:22:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:22:53.409794 543705 memory.go:184] no items to output this cycle
I0320 11:22:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 11:23:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:03.409777 543705 memory.go:184] no items to output this cycle
I0320 11:23:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:23:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:13.409879 543705 cpu.go:282] Add success.
I0320 11:23:13.409895 543705 memory.go:191] Add success.
W0320 11:23:13.409930 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:23:13.409945 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:23:13.409950 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:23:13.419733 543705 net.go:648] Add success.
I0320 11:23:13.422603 543705 net.go:770] primary dev: ETH0
I0320 11:23:13.422618 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:23:13.422631 543705 net.go:698] Add success.
I0320 11:23:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:23:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:23:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 11:23:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:23:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 11:23:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:23:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:23:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:23:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:23:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:23:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:23:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:23.409778 543705 memory.go:184] no items to output this cycle
I0320 11:23:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 11:23:24.053673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:23:24.056203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:23:24.056209 543705 disk_info.go:196] parse disk info done, disk is : [0xc000549d40 0xc000549d80]
E0320 11:23:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:33.409765 543705 memory.go:184] no items to output this cycle
I0320 11:23:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 11:23:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:43.409797 543705 memory.go:191] Add success.
I0320 11:23:43.409799 543705 cpu.go:282] Add success.
I0320 11:23:43.419871 543705 net.go:648] Add success.
I0320 11:23:43.422918 543705 net.go:770] primary dev: ETH0
I0320 11:23:43.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:23:43.422952 543705 net.go:698] Add success.
I0320 11:23:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:23:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:23:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:23:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:23:53.409772 543705 memory.go:184] no items to output this cycle
I0320 11:23:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:24:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:03.409784 543705 memory.go:184] no items to output this cycle
I0320 11:24:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 11:24:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:13.409814 543705 memory.go:191] Add success.
I0320 11:24:13.409824 543705 cpu.go:282] Add success.
W0320 11:24:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:24:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:24:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:24:13.420510 543705 net.go:648] Add success.
I0320 11:24:13.423109 543705 net.go:770] primary dev: ETH0
I0320 11:24:13.423122 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:24:13.423133 543705 net.go:698] Add success.
I0320 11:24:13.467953 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"10d41998-f74d-47a7-9039-5b328228f190","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:24:13.467983 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:24:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:24:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:24:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 11:24:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:24:14.456531 543705 disk_worker.go:494] system disk:vda1
I0320 11:24:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:24:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:24:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:24:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:24:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:24:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:24:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:23.409814 543705 memory.go:184] no items to output this cycle
I0320 11:24:23.409823 543705 cpu.go:275] no items to output this cycle
I0320 11:24:24.057691 543705 disk_info.go:125] begin check local disk info of client
I0320 11:24:24.060225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:24:24.060231 543705 disk_info.go:196] parse disk info done, disk is : [0xc000515340 0xc000515380]
E0320 11:24:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:33.409795 543705 memory.go:184] no items to output this cycle
I0320 11:24:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 11:24:38.448990 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:24:38.448997 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:24:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:43.410766 543705 memory.go:191] Add success.
I0320 11:24:43.409801 543705 cpu.go:282] Add success.
I0320 11:24:43.420572 543705 net.go:648] Add success.
I0320 11:24:43.423574 543705 net.go:770] primary dev: ETH0
I0320 11:24:43.423588 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:24:43.423600 543705 net.go:698] Add success.
I0320 11:24:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:24:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:24:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:24:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:24:53.409767 543705 memory.go:184] no items to output this cycle
I0320 11:24:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:25:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:03.409803 543705 memory.go:184] no items to output this cycle
I0320 11:25:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 11:25:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:13.409797 543705 memory.go:191] Add success.
I0320 11:25:13.409797 543705 cpu.go:282] Add success.
W0320 11:25:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:25:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:25:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:25:13.420222 543705 net.go:648] Add success.
I0320 11:25:13.423320 543705 net.go:770] primary dev: ETH0
I0320 11:25:13.423335 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:25:13.423348 543705 net.go:698] Add success.
I0320 11:25:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:25:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:25:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:25:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:25:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 11:25:14.456531 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:25:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:25:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:25:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:25:16.458045 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:25:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:25:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:23.409782 543705 memory.go:184] no items to output this cycle
I0320 11:25:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 11:25:24.061673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:25:24.064191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:25:24.064197 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046f1c0 0xc00046f200]
E0320 11:25:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:33.409799 543705 memory.go:184] no items to output this cycle
I0320 11:25:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 11:25:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:43.409815 543705 memory.go:191] Add success.
I0320 11:25:43.409821 543705 cpu.go:282] Add success.
I0320 11:25:43.420000 543705 net.go:648] Add success.
I0320 11:25:43.422749 543705 net.go:770] primary dev: ETH0
I0320 11:25:43.422763 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:25:43.422777 543705 net.go:698] Add success.
I0320 11:25:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:25:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:25:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:25:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:25:53.409787 543705 memory.go:184] no items to output this cycle
I0320 11:25:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 11:26:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:03.409786 543705 cpu.go:275] no items to output this cycle
I0320 11:26:03.409790 543705 memory.go:184] no items to output this cycle
E0320 11:26:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:13.409799 543705 memory.go:191] Add success.
I0320 11:26:13.409800 543705 cpu.go:282] Add success.
W0320 11:26:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:26:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:26:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:26:13.420401 543705 net.go:648] Add success.
I0320 11:26:13.423248 543705 net.go:770] primary dev: ETH0
I0320 11:26:13.423263 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:26:13.423276 543705 net.go:698] Add success.
I0320 11:26:14.453955 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:26:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:26:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 11:26:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:26:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 11:26:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:26:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:26:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:26:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:26:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:23.409814 543705 memory.go:184] no items to output this cycle
I0320 11:26:23.409824 543705 cpu.go:275] no items to output this cycle
I0320 11:26:24.065670 543705 disk_info.go:125] begin check local disk info of client
I0320 11:26:24.068241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:26:24.068248 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046f3c0 0xc00046f400]
E0320 11:26:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:33.409778 543705 memory.go:184] no items to output this cycle
I0320 11:26:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 11:26:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:43.409787 543705 memory.go:191] Add success.
I0320 11:26:43.409800 543705 cpu.go:282] Add success.
I0320 11:26:43.420076 543705 net.go:648] Add success.
I0320 11:26:43.422799 543705 net.go:770] primary dev: ETH0
I0320 11:26:43.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:26:43.422828 543705 net.go:698] Add success.
I0320 11:26:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:26:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:26:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:26:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:26:53.409794 543705 memory.go:184] no items to output this cycle
I0320 11:26:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 11:27:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:03.409782 543705 memory.go:184] no items to output this cycle
I0320 11:27:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 11:27:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:13.409811 543705 memory.go:191] Add success.
I0320 11:27:13.409813 543705 cpu.go:282] Add success.
W0320 11:27:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:27:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:27:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:27:13.420269 543705 net.go:648] Add success.
I0320 11:27:13.423324 543705 net.go:770] primary dev: ETH0
I0320 11:27:13.423337 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:27:13.423348 543705 net.go:698] Add success.
I0320 11:27:13.430053 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 11:27:13.453295 543705 event_worker.go:152] Polling the log file for events...
I0320 11:27:13.468647 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1eff5136-9e64-4d3e-9d8a-5216d777b707","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:27:13.468680 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 11:27:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:27:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 11:27:14.455214 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:27:14.455907 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:27:14.455917 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:27:14.455923 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:27:14.456760 543705 disk_worker.go:494] system disk:vda1
I0320 11:27:14.456795 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:27:15.456815 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:27:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:27:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:27:16.457922 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:27:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:27:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:27:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:27:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:23.409788 543705 memory.go:184] no items to output this cycle
I0320 11:27:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 11:27:24.069668 543705 disk_info.go:125] begin check local disk info of client
I0320 11:27:24.072273 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:27:24.072280 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047ee00 0xc00047ee40]
E0320 11:27:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:33.409790 543705 memory.go:184] no items to output this cycle
I0320 11:27:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 11:27:38.450021 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:27:38.450028 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:27:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:43.410635 543705 memory.go:191] Add success.
I0320 11:27:43.409792 543705 cpu.go:282] Add success.
I0320 11:27:43.420400 543705 net.go:648] Add success.
I0320 11:27:43.423263 543705 net.go:770] primary dev: ETH0
I0320 11:27:43.423276 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:27:43.423291 543705 net.go:698] Add success.
I0320 11:27:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:27:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:27:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:27:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:27:53.409770 543705 memory.go:184] no items to output this cycle
I0320 11:27:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 11:28:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:03.409800 543705 memory.go:184] no items to output this cycle
I0320 11:28:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 11:28:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:13.409780 543705 memory.go:191] Add success.
I0320 11:28:13.409803 543705 cpu.go:282] Add success.
W0320 11:28:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:28:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:28:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:28:13.420072 543705 net.go:648] Add success.
I0320 11:28:13.422768 543705 net.go:770] primary dev: ETH0
I0320 11:28:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:28:13.422797 543705 net.go:698] Add success.
I0320 11:28:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:28:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:28:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 11:28:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:28:14.456535 543705 disk_worker.go:494] system disk:vda1
I0320 11:28:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:28:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:28:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:28:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:28:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:28:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:28:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:23.409789 543705 memory.go:184] no items to output this cycle
I0320 11:28:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 11:28:24.073671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:28:24.076265 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:28:24.076270 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e2340 0xc0001e2380]
E0320 11:28:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:33.409788 543705 memory.go:184] no items to output this cycle
I0320 11:28:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 11:28:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:43.409795 543705 memory.go:191] Add success.
I0320 11:28:43.409796 543705 cpu.go:282] Add success.
I0320 11:28:43.420061 543705 net.go:648] Add success.
I0320 11:28:43.422743 543705 net.go:770] primary dev: ETH0
I0320 11:28:43.422757 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:28:43.422768 543705 net.go:698] Add success.
I0320 11:28:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:28:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:28:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:28:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:28:53.409777 543705 memory.go:184] no items to output this cycle
I0320 11:28:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 11:29:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:03.409786 543705 memory.go:184] no items to output this cycle
I0320 11:29:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:29:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:13.409805 543705 memory.go:191] Add success.
I0320 11:29:13.409812 543705 cpu.go:282] Add success.
W0320 11:29:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:29:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:29:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:29:13.420271 543705 net.go:648] Add success.
I0320 11:29:13.422918 543705 net.go:770] primary dev: ETH0
I0320 11:29:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:29:13.422945 543705 net.go:698] Add success.
I0320 11:29:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:29:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:29:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 11:29:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:29:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 11:29:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:29:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:29:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:29:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:29:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:29:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:29:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:23.409802 543705 memory.go:184] no items to output this cycle
I0320 11:29:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 11:29:24.077671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:29:24.080184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:29:24.080190 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5900 0xc0000c5940]
E0320 11:29:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:33.409808 543705 memory.go:184] no items to output this cycle
I0320 11:29:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 11:29:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:43.409787 543705 memory.go:191] Add success.
I0320 11:29:43.409810 543705 cpu.go:282] Add success.
I0320 11:29:43.419961 543705 net.go:648] Add success.
I0320 11:29:43.422921 543705 net.go:770] primary dev: ETH0
I0320 11:29:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:29:43.422950 543705 net.go:698] Add success.
I0320 11:29:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:29:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:29:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:29:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:29:53.409792 543705 memory.go:184] no items to output this cycle
I0320 11:29:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:30:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:03.409779 543705 memory.go:184] no items to output this cycle
I0320 11:30:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 11:30:13.409856 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:13.409884 543705 memory.go:191] Add success.
I0320 11:30:13.409914 543705 cpu.go:282] Add success.
W0320 11:30:13.409921 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:30:13.409935 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:30:13.409942 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:30:13.419722 543705 net.go:648] Add success.
I0320 11:30:13.423270 543705 net.go:770] primary dev: ETH0
I0320 11:30:13.423281 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:30:13.423292 543705 net.go:698] Add success.
I0320 11:30:13.463496 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2d4b875a-5e7d-4a06-8e46-12bb40f63ab6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:30:13.463527 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:30:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:30:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:30:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 11:30:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:30:14.456672 543705 disk_worker.go:494] system disk:vda1
I0320 11:30:14.456701 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:30:15.455616 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:30:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:30:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:30:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:30:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:30:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 11:30:23.409800 543705 memory.go:184] no items to output this cycle
I0320 11:30:24.081676 543705 disk_info.go:125] begin check local disk info of client
I0320 11:30:24.084211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:30:24.084217 543705 disk_info.go:196] parse disk info done, disk is : [0xc000287300 0xc000287340]
E0320 11:30:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:33.409784 543705 memory.go:184] no items to output this cycle
I0320 11:30:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 11:30:38.451009 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:30:38.451016 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:30:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:43.410830 543705 memory.go:191] Add success.
I0320 11:30:43.409803 543705 cpu.go:282] Add success.
I0320 11:30:43.420539 543705 net.go:648] Add success.
I0320 11:30:43.423483 543705 net.go:770] primary dev: ETH0
I0320 11:30:43.423496 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:30:43.423509 543705 net.go:698] Add success.
I0320 11:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:30:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:30:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:30:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:30:53.409771 543705 memory.go:184] no items to output this cycle
I0320 11:30:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 11:31:03.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:03.409914 543705 memory.go:184] no items to output this cycle
I0320 11:31:03.409926 543705 cpu.go:275] no items to output this cycle
E0320 11:31:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:13.409780 543705 memory.go:191] Add success.
I0320 11:31:13.409804 543705 cpu.go:282] Add success.
W0320 11:31:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:31:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:31:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:31:13.420167 543705 net.go:648] Add success.
I0320 11:31:13.422764 543705 net.go:770] primary dev: ETH0
I0320 11:31:13.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:31:13.422788 543705 net.go:698] Add success.
I0320 11:31:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:31:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:31:14.455140 543705 disk_worker.go:708] disk space is not compliant
W0320 11:31:14.455142 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:31:14.456432 543705 disk_worker.go:494] system disk:vda1
I0320 11:31:14.456476 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:31:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:31:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:31:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:31:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:31:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:31:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:31:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 11:31:24.085669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:31:24.088204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:31:24.088210 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a1a80 0xc0002a1ac0]
E0320 11:31:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:33.409803 543705 memory.go:184] no items to output this cycle
I0320 11:31:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 11:31:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:43.409821 543705 memory.go:191] Add success.
I0320 11:31:43.409831 543705 cpu.go:282] Add success.
I0320 11:31:43.420001 543705 net.go:648] Add success.
I0320 11:31:43.422522 543705 net.go:770] primary dev: ETH0
I0320 11:31:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:31:43.422549 543705 net.go:698] Add success.
I0320 11:31:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:31:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:31:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:31:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:31:53.409796 543705 memory.go:184] no items to output this cycle
I0320 11:31:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 11:32:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:03.409803 543705 memory.go:184] no items to output this cycle
I0320 11:32:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 11:32:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:13.409811 543705 memory.go:191] Add success.
I0320 11:32:13.409814 543705 cpu.go:282] Add success.
W0320 11:32:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:32:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:32:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:32:13.420084 543705 net.go:648] Add success.
I0320 11:32:13.423605 543705 net.go:770] primary dev: ETH0
I0320 11:32:13.423618 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:32:13.423629 543705 net.go:698] Add success.
W0320 11:32:14.455077 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:32:14.455133 543705 disk_worker.go:708] disk space is not compliant
W0320 11:32:14.455136 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:32:14.456874 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:32:14.456883 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:32:14.456889 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:32:14.456964 543705 disk_worker.go:494] system disk:vda1
I0320 11:32:14.456993 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:32:15.456856 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:32:15.456864 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:32:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:32:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:32:16.457989 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:32:16.458008 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:32:16.472327 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:32:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:23.409796 543705 memory.go:184] no items to output this cycle
I0320 11:32:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 11:32:24.089671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:32:24.092268 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:32:24.092274 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b9380 0xc0003b93c0]
E0320 11:32:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:33.409782 543705 memory.go:184] no items to output this cycle
I0320 11:32:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 11:32:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:43.409794 543705 cpu.go:282] Add success.
I0320 11:32:43.409798 543705 memory.go:191] Add success.
I0320 11:32:43.419900 543705 net.go:648] Add success.
I0320 11:32:43.422757 543705 net.go:770] primary dev: ETH0
I0320 11:32:43.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:32:43.422786 543705 net.go:698] Add success.
I0320 11:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:32:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:32:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:32:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:32:53.409782 543705 memory.go:184] no items to output this cycle
I0320 11:32:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 11:33:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:03.409780 543705 memory.go:184] no items to output this cycle
I0320 11:33:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 11:33:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:13.409796 543705 memory.go:191] Add success.
I0320 11:33:13.409798 543705 cpu.go:282] Add success.
W0320 11:33:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:33:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:33:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:33:13.420495 543705 net.go:648] Add success.
I0320 11:33:13.423184 543705 net.go:770] primary dev: ETH0
I0320 11:33:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:33:13.423208 543705 net.go:698] Add success.
I0320 11:33:13.960578 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"64c2451f-1b7d-47b9-b482-2071c2da8108","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:33:13.960623 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:33:14.454679 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:33:14.454853 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:33:14.454864 543705 disk_worker.go:708] disk space is not compliant
W0320 11:33:14.454866 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:33:14.456198 543705 disk_worker.go:494] system disk:vda1
I0320 11:33:14.456252 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:33:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:33:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:33:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:33:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:33:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:33:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:23.409799 543705 memory.go:184] no items to output this cycle
I0320 11:33:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 11:33:24.093694 543705 disk_info.go:125] begin check local disk info of client
I0320 11:33:24.096200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:33:24.096205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003446c0 0xc000344700]
E0320 11:33:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:33.409772 543705 memory.go:184] no items to output this cycle
I0320 11:33:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 11:33:38.452024 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:33:38.452032 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:33:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:43.410613 543705 memory.go:191] Add success.
I0320 11:33:43.409821 543705 cpu.go:282] Add success.
I0320 11:33:43.420363 543705 net.go:648] Add success.
I0320 11:33:43.422858 543705 net.go:770] primary dev: ETH0
I0320 11:33:43.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:33:43.422887 543705 net.go:698] Add success.
I0320 11:33:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:33:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:33:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:33:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:33:53.409782 543705 memory.go:184] no items to output this cycle
I0320 11:33:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:34:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:03.409806 543705 memory.go:184] no items to output this cycle
I0320 11:34:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 11:34:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:13.409793 543705 memory.go:191] Add success.
W0320 11:34:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:34:13.409821 543705 cpu.go:282] Add success.
W0320 11:34:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:34:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:34:13.420143 543705 net.go:648] Add success.
I0320 11:34:13.423024 543705 net.go:770] primary dev: ETH0
I0320 11:34:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:34:13.423052 543705 net.go:698] Add success.
I0320 11:34:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:34:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:34:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 11:34:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:34:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 11:34:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:34:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:34:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:34:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:34:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:34:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:34:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:34:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 11:34:24.097675 543705 disk_info.go:125] begin check local disk info of client
I0320 11:34:24.100220 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:34:24.100225 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344200 0xc000344240]
E0320 11:34:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:33.409806 543705 memory.go:184] no items to output this cycle
I0320 11:34:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 11:34:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:43.409791 543705 memory.go:191] Add success.
I0320 11:34:43.409825 543705 cpu.go:282] Add success.
I0320 11:34:43.419893 543705 net.go:648] Add success.
I0320 11:34:43.422877 543705 net.go:770] primary dev: ETH0
I0320 11:34:43.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:34:43.422904 543705 net.go:698] Add success.
I0320 11:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:34:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:34:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:34:53.410360 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:34:53.410381 543705 memory.go:184] no items to output this cycle
I0320 11:34:53.410383 543705 cpu.go:275] no items to output this cycle
E0320 11:35:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:03.409808 543705 memory.go:184] no items to output this cycle
I0320 11:35:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 11:35:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:13.409789 543705 memory.go:191] Add success.
I0320 11:35:13.409810 543705 cpu.go:282] Add success.
W0320 11:35:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:35:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:35:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:35:13.420369 543705 net.go:648] Add success.
I0320 11:35:13.423097 543705 net.go:770] primary dev: ETH0
I0320 11:35:13.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:35:13.423122 543705 net.go:698] Add success.
I0320 11:35:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:35:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:35:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 11:35:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:35:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 11:35:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:35:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:35:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:35:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:35:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:35:23.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:23.409829 543705 memory.go:184] no items to output this cycle
I0320 11:35:23.409833 543705 cpu.go:275] no items to output this cycle
I0320 11:35:24.101671 543705 disk_info.go:125] begin check local disk info of client
I0320 11:35:24.104213 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:35:24.104218 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c52c0 0xc0000c5300]
E0320 11:35:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:33.409791 543705 memory.go:184] no items to output this cycle
I0320 11:35:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:35:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:43.409828 543705 memory.go:191] Add success.
I0320 11:35:43.409829 543705 cpu.go:282] Add success.
I0320 11:35:43.420036 543705 net.go:648] Add success.
I0320 11:35:43.422935 543705 net.go:770] primary dev: ETH0
I0320 11:35:43.422950 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:35:43.422964 543705 net.go:698] Add success.
I0320 11:35:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:35:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:35:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:35:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:35:53.409794 543705 memory.go:184] no items to output this cycle
I0320 11:35:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 11:36:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:03.409782 543705 memory.go:184] no items to output this cycle
I0320 11:36:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 11:36:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:13.409793 543705 memory.go:191] Add success.
I0320 11:36:13.409794 543705 cpu.go:282] Add success.
W0320 11:36:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:36:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:36:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:36:13.420245 543705 net.go:648] Add success.
I0320 11:36:13.423304 543705 net.go:770] primary dev: ETH0
I0320 11:36:13.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:36:13.423328 543705 net.go:698] Add success.
I0320 11:36:13.463037 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5a5ec593-907b-4aa6-8e66-abccbd5e9ed2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:36:13.463069 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:36:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:36:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:36:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 11:36:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:36:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 11:36:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:36:15.455608 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:36:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:36:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:36:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:36:16.472425 543705 disk_local_worker.go:436] Get disk info: []
I0320 11:36:23.410514 543705 cpu.go:275] no items to output this cycle
E0320 11:36:23.410522 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:23.410540 543705 memory.go:184] no items to output this cycle
I0320 11:36:24.105676 543705 disk_info.go:125] begin check local disk info of client
I0320 11:36:24.108454 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:36:24.108460 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5980 0xc0000c59c0]
E0320 11:36:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:33.409797 543705 memory.go:184] no items to output this cycle
I0320 11:36:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 11:36:38.453012 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:36:38.453018 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:36:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:43.410984 543705 memory.go:191] Add success.
I0320 11:36:43.409797 543705 cpu.go:282] Add success.
I0320 11:36:43.420694 543705 net.go:648] Add success.
I0320 11:36:43.423831 543705 net.go:770] primary dev: ETH0
I0320 11:36:43.423844 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:36:43.423858 543705 net.go:698] Add success.
I0320 11:36:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:36:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:36:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:36:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:36:53.409781 543705 memory.go:184] no items to output this cycle
I0320 11:36:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 11:37:03.409843 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:03.409866 543705 memory.go:184] no items to output this cycle
I0320 11:37:03.409893 543705 cpu.go:275] no items to output this cycle
E0320 11:37:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:13.409817 543705 memory.go:191] Add success.
I0320 11:37:13.409827 543705 cpu.go:282] Add success.
W0320 11:37:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:37:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:37:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:37:13.420141 543705 net.go:648] Add success.
I0320 11:37:13.422803 543705 net.go:770] primary dev: ETH0
I0320 11:37:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:37:13.422828 543705 net.go:698] Add success.
I0320 11:37:13.453422 543705 event_worker.go:152] Polling the log file for events...
W0320 11:37:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:37:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 11:37:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:37:14.456862 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:37:14.456871 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:37:14.456877 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:37:14.456969 543705 disk_worker.go:494] system disk:vda1
I0320 11:37:14.456998 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:37:15.456842 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:37:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:37:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:37:16.457918 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:37:16.457982 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:37:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:37:16.472336 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:37:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:23.409817 543705 memory.go:184] no items to output this cycle
I0320 11:37:23.409830 543705 cpu.go:275] no items to output this cycle
I0320 11:37:24.109670 543705 disk_info.go:125] begin check local disk info of client
I0320 11:37:24.112232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:37:24.112238 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a700 0xc00007a740]
E0320 11:37:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:33.409770 543705 memory.go:184] no items to output this cycle
I0320 11:37:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:37:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:43.409825 543705 memory.go:191] Add success.
I0320 11:37:43.409832 543705 cpu.go:282] Add success.
I0320 11:37:43.419994 543705 net.go:648] Add success.
I0320 11:37:43.422799 543705 net.go:770] primary dev: ETH0
I0320 11:37:43.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:37:43.422829 543705 net.go:698] Add success.
I0320 11:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:37:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:37:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:37:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:37:53.409802 543705 memory.go:184] no items to output this cycle
I0320 11:37:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 11:38:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:03.409897 543705 cpu.go:275] no items to output this cycle
I0320 11:38:03.409924 543705 memory.go:184] no items to output this cycle
E0320 11:38:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:13.409825 543705 memory.go:191] Add success.
I0320 11:38:13.409833 543705 cpu.go:282] Add success.
W0320 11:38:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:38:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:38:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:38:13.420201 543705 net.go:648] Add success.
I0320 11:38:13.422999 543705 net.go:770] primary dev: ETH0
I0320 11:38:13.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:38:13.423038 543705 net.go:698] Add success.
I0320 11:38:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:38:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:38:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 11:38:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:38:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 11:38:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:38:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:38:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:38:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:38:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:38:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:38:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:23.409794 543705 memory.go:184] no items to output this cycle
I0320 11:38:23.409827 543705 cpu.go:275] no items to output this cycle
I0320 11:38:24.113675 543705 disk_info.go:125] begin check local disk info of client
I0320 11:38:24.116225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:38:24.116231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c55c0 0xc0000c5600]
E0320 11:38:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:33.409800 543705 memory.go:184] no items to output this cycle
I0320 11:38:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 11:38:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:43.409791 543705 memory.go:191] Add success.
I0320 11:38:43.409815 543705 cpu.go:282] Add success.
I0320 11:38:43.420016 543705 net.go:648] Add success.
I0320 11:38:43.422929 543705 net.go:770] primary dev: ETH0
I0320 11:38:43.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:38:43.422973 543705 net.go:698] Add success.
I0320 11:38:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:38:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:38:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:38:53.410374 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:38:53.410392 543705 memory.go:184] no items to output this cycle
I0320 11:38:53.410421 543705 cpu.go:275] no items to output this cycle
E0320 11:39:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:03.409775 543705 memory.go:184] no items to output this cycle
I0320 11:39:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 11:39:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:13.409797 543705 memory.go:191] Add success.
I0320 11:39:13.409797 543705 cpu.go:282] Add success.
W0320 11:39:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:39:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:39:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:39:13.420163 543705 net.go:648] Add success.
I0320 11:39:13.422778 543705 net.go:770] primary dev: ETH0
I0320 11:39:13.422792 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:39:13.422804 543705 net.go:698] Add success.
I0320 11:39:13.877834 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1722ee1b-db6b-456a-b2db-4a42dcb91337","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:39:13.877873 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:39:14.454725 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:39:14.454963 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:39:14.454974 543705 disk_worker.go:708] disk space is not compliant
W0320 11:39:14.454976 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:39:14.456461 543705 disk_worker.go:494] system disk:vda1
I0320 11:39:14.456490 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:39:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:39:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:39:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:39:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:39:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:39:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 11:39:23.409797 543705 memory.go:184] no items to output this cycle
I0320 11:39:24.117674 543705 disk_info.go:125] begin check local disk info of client
I0320 11:39:24.120172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:39:24.120178 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba500 0xc0002ba540]
E0320 11:39:33.409828 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:33.409857 543705 memory.go:184] no items to output this cycle
I0320 11:39:33.409894 543705 cpu.go:275] no items to output this cycle
I0320 11:39:38.454045 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:39:38.454053 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:39:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:43.410730 543705 memory.go:191] Add success.
I0320 11:39:43.409810 543705 cpu.go:282] Add success.
I0320 11:39:43.420420 543705 net.go:648] Add success.
I0320 11:39:43.423196 543705 net.go:770] primary dev: ETH0
I0320 11:39:43.423212 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:39:43.423228 543705 net.go:698] Add success.
I0320 11:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:39:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:39:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:39:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:39:53.409781 543705 memory.go:184] no items to output this cycle
I0320 11:39:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 11:40:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:03.409766 543705 memory.go:184] no items to output this cycle
I0320 11:40:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 11:40:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:13.409781 543705 memory.go:191] Add success.
W0320 11:40:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:40:13.409811 543705 cpu.go:282] Add success.
W0320 11:40:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:40:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:40:13.420092 543705 net.go:648] Add success.
I0320 11:40:13.423108 543705 net.go:770] primary dev: ETH0
I0320 11:40:13.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:40:13.423133 543705 net.go:698] Add success.
I0320 11:40:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:40:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:40:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 11:40:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:40:14.456584 543705 disk_worker.go:494] system disk:vda1
I0320 11:40:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:40:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:40:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:40:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:40:16.472492 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:40:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:23.409812 543705 memory.go:184] no items to output this cycle
I0320 11:40:23.409820 543705 cpu.go:275] no items to output this cycle
I0320 11:40:24.121669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:40:24.124205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:40:24.124210 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5c80 0xc0000c5cc0]
E0320 11:40:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:33.409804 543705 memory.go:184] no items to output this cycle
I0320 11:40:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 11:40:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:43.409779 543705 memory.go:191] Add success.
I0320 11:40:43.409802 543705 cpu.go:282] Add success.
I0320 11:40:43.419878 543705 net.go:648] Add success.
I0320 11:40:43.422886 543705 net.go:770] primary dev: ETH0
I0320 11:40:43.422900 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:40:43.422912 543705 net.go:698] Add success.
I0320 11:40:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:40:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:40:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:40:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:40:53.409781 543705 memory.go:184] no items to output this cycle
I0320 11:40:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 11:41:03.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:03.409888 543705 memory.go:184] no items to output this cycle
I0320 11:41:03.409935 543705 cpu.go:275] no items to output this cycle
E0320 11:41:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:13.409810 543705 memory.go:191] Add success.
I0320 11:41:13.409824 543705 cpu.go:282] Add success.
W0320 11:41:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:41:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:41:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:41:13.420372 543705 net.go:648] Add success.
I0320 11:41:13.423790 543705 net.go:770] primary dev: ETH0
I0320 11:41:13.423803 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:41:13.423814 543705 net.go:698] Add success.
I0320 11:41:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:41:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:41:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 11:41:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:41:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 11:41:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:41:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:41:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:41:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:41:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:41:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:23.409777 543705 memory.go:184] no items to output this cycle
I0320 11:41:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 11:41:24.125673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:41:24.128269 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:41:24.128274 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475000 0xc000475040]
E0320 11:41:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:33.409772 543705 memory.go:184] no items to output this cycle
I0320 11:41:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 11:41:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:43.409826 543705 memory.go:191] Add success.
I0320 11:41:43.409831 543705 cpu.go:282] Add success.
I0320 11:41:43.419950 543705 net.go:648] Add success.
I0320 11:41:43.422718 543705 net.go:770] primary dev: ETH0
I0320 11:41:43.422731 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:41:43.422743 543705 net.go:698] Add success.
I0320 11:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:41:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:41:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:41:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:41:53.409796 543705 memory.go:184] no items to output this cycle
I0320 11:41:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 11:42:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:03.409782 543705 memory.go:184] no items to output this cycle
I0320 11:42:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 11:42:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:13.409809 543705 memory.go:191] Add success.
I0320 11:42:13.409819 543705 cpu.go:282] Add success.
W0320 11:42:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:42:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:42:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:42:13.420122 543705 net.go:648] Add success.
I0320 11:42:13.422990 543705 net.go:770] primary dev: ETH0
I0320 11:42:13.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:42:13.423014 543705 net.go:698] Add success.
I0320 11:42:13.471448 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"effc8314-0db7-4aea-a43a-881fb2d259da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:42:13.471480 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 11:42:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:42:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 11:42:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:42:14.456168 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:42:14.456178 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:42:14.456183 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:42:14.456487 543705 disk_worker.go:494] system disk:vda1
I0320 11:42:14.456518 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:42:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:42:15.456847 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:42:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:42:16.457989 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:42:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:42:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:42:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:42:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:42:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 11:42:24.129669 543705 disk_info.go:125] begin check local disk info of client
I0320 11:42:24.132296 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:42:24.132302 543705 disk_info.go:196] parse disk info done, disk is : [0xc000233700 0xc000233740]
E0320 11:42:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:33.409778 543705 memory.go:184] no items to output this cycle
I0320 11:42:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 11:42:38.455027 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:42:38.455034 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:42:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:43.410795 543705 memory.go:191] Add success.
I0320 11:42:43.409802 543705 cpu.go:282] Add success.
I0320 11:42:43.420584 543705 net.go:648] Add success.
I0320 11:42:43.423629 543705 net.go:770] primary dev: ETH0
I0320 11:42:43.423644 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:42:43.423658 543705 net.go:698] Add success.
I0320 11:42:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:42:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:42:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:42:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:42:53.409766 543705 memory.go:184] no items to output this cycle
I0320 11:42:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 11:43:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:03.409800 543705 memory.go:184] no items to output this cycle
I0320 11:43:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 11:43:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:13.409777 543705 memory.go:191] Add success.
I0320 11:43:13.409797 543705 cpu.go:282] Add success.
W0320 11:43:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:43:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:43:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:43:13.420175 543705 net.go:648] Add success.
I0320 11:43:13.422825 543705 net.go:770] primary dev: ETH0
I0320 11:43:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:43:13.422849 543705 net.go:698] Add success.
I0320 11:43:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:43:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:43:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 11:43:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:43:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 11:43:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:43:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:43:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:43:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:43:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:43:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:43:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:23.409783 543705 memory.go:184] no items to output this cycle
I0320 11:43:23.409854 543705 cpu.go:275] no items to output this cycle
I0320 11:43:24.133673 543705 disk_info.go:125] begin check local disk info of client
I0320 11:43:24.136174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:43:24.136180 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487500 0xc000487540]
E0320 11:43:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:33.409768 543705 memory.go:184] no items to output this cycle
I0320 11:43:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:43:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:43.409830 543705 memory.go:191] Add success.
I0320 11:43:43.409837 543705 cpu.go:282] Add success.
I0320 11:43:43.420076 543705 net.go:648] Add success.
I0320 11:43:43.423089 543705 net.go:770] primary dev: ETH0
I0320 11:43:43.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:43:43.423120 543705 net.go:698] Add success.
I0320 11:43:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:43:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:43:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:43:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:43:53.409799 543705 memory.go:184] no items to output this cycle
I0320 11:43:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 11:44:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:03.409785 543705 memory.go:184] no items to output this cycle
I0320 11:44:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 11:44:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:13.409806 543705 memory.go:191] Add success.
I0320 11:44:13.409818 543705 cpu.go:282] Add success.
W0320 11:44:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:44:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:44:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:44:13.420415 543705 net.go:648] Add success.
I0320 11:44:13.423121 543705 net.go:770] primary dev: ETH0
I0320 11:44:13.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:44:13.423145 543705 net.go:698] Add success.
I0320 11:44:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:44:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:44:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 11:44:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:44:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 11:44:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:44:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:44:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:44:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:44:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:44:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:44:23.409907 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:23.409939 543705 memory.go:184] no items to output this cycle
I0320 11:44:23.409948 543705 cpu.go:275] no items to output this cycle
I0320 11:44:24.137679 543705 disk_info.go:125] begin check local disk info of client
I0320 11:44:24.140226 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:44:24.140232 543705 disk_info.go:196] parse disk info done, disk is : [0xc000289300 0xc000289340]
E0320 11:44:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:33.409805 543705 memory.go:184] no items to output this cycle
I0320 11:44:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 11:44:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:43.409786 543705 memory.go:191] Add success.
I0320 11:44:43.409805 543705 cpu.go:282] Add success.
I0320 11:44:43.419978 543705 net.go:648] Add success.
I0320 11:44:43.422636 543705 net.go:770] primary dev: ETH0
I0320 11:44:43.422649 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:44:43.422662 543705 net.go:698] Add success.
I0320 11:44:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:44:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:44:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:44:53.409795 543705 memory.go:184] no items to output this cycle
I0320 11:44:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:45:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:03.409766 543705 memory.go:184] no items to output this cycle
I0320 11:45:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 11:45:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:13.409777 543705 memory.go:191] Add success.
I0320 11:45:13.409789 543705 cpu.go:282] Add success.
W0320 11:45:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:45:13.412367 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:45:13.412372 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:45:13.420073 543705 net.go:648] Add success.
I0320 11:45:13.421697 543705 net.go:770] primary dev: ETH0
I0320 11:45:13.421710 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:45:13.421723 543705 net.go:698] Add success.
I0320 11:45:13.464566 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d9546ef2-8b07-43ee-87c2-7c2f5a446147","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:45:13.464600 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:45:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:45:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:45:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 11:45:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:45:14.456842 543705 disk_worker.go:494] system disk:vda1
I0320 11:45:14.456870 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:45:15.455623 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:45:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:45:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:45:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:45:16.472384 543705 disk_local_worker.go:436] Get disk info: []
I0320 11:45:23.409808 543705 cpu.go:275] no items to output this cycle
E0320 11:45:23.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:23.409831 543705 memory.go:184] no items to output this cycle
I0320 11:45:24.141681 543705 disk_info.go:125] begin check local disk info of client
I0320 11:45:24.144228 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:45:24.144235 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ba40 0xc00007ba80]
E0320 11:45:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:33.409770 543705 memory.go:184] no items to output this cycle
I0320 11:45:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 11:45:38.456036 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:45:38.456042 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:45:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:43.410826 543705 memory.go:191] Add success.
I0320 11:45:43.409841 543705 cpu.go:282] Add success.
I0320 11:45:43.420670 543705 net.go:648] Add success.
I0320 11:45:43.423507 543705 net.go:770] primary dev: ETH0
I0320 11:45:43.423520 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:45:43.423532 543705 net.go:698] Add success.
I0320 11:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:45:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:45:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:45:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:45:53.409785 543705 cpu.go:275] no items to output this cycle
I0320 11:45:53.409791 543705 memory.go:184] no items to output this cycle
E0320 11:46:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:03.409769 543705 memory.go:184] no items to output this cycle
I0320 11:46:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 11:46:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:13.409781 543705 memory.go:191] Add success.
W0320 11:46:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:46:13.409816 543705 cpu.go:282] Add success.
W0320 11:46:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:46:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:46:13.420305 543705 net.go:648] Add success.
I0320 11:46:13.423228 543705 net.go:770] primary dev: ETH0
I0320 11:46:13.423241 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:46:13.423251 543705 net.go:698] Add success.
I0320 11:46:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:46:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:46:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 11:46:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:46:14.456465 543705 disk_worker.go:494] system disk:vda1
I0320 11:46:14.456521 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:46:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:46:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:46:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:46:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:46:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:46:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:23.409785 543705 memory.go:184] no items to output this cycle
I0320 11:46:23.409836 543705 cpu.go:275] no items to output this cycle
I0320 11:46:24.145683 543705 disk_info.go:125] begin check local disk info of client
I0320 11:46:24.148249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:46:24.148255 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003cf340 0xc0003cf380]
E0320 11:46:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:33.409805 543705 memory.go:184] no items to output this cycle
I0320 11:46:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 11:46:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:43.409801 543705 memory.go:191] Add success.
I0320 11:46:43.409805 543705 cpu.go:282] Add success.
I0320 11:46:43.419876 543705 net.go:648] Add success.
I0320 11:46:43.422533 543705 net.go:770] primary dev: ETH0
I0320 11:46:43.422561 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:46:43.422576 543705 net.go:698] Add success.
I0320 11:46:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:46:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:46:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:46:53.410233 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:46:53.410258 543705 memory.go:184] no items to output this cycle
I0320 11:46:53.410270 543705 cpu.go:275] no items to output this cycle
E0320 11:47:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:03.409796 543705 memory.go:184] no items to output this cycle
I0320 11:47:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 11:47:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:13.409812 543705 memory.go:191] Add success.
I0320 11:47:13.409827 543705 cpu.go:282] Add success.
W0320 11:47:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:47:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:47:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:47:13.419739 543705 net.go:648] Add success.
I0320 11:47:13.422467 543705 net.go:770] primary dev: ETH0
I0320 11:47:13.422481 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:47:13.422494 543705 net.go:698] Add success.
I0320 11:47:13.453036 543705 event_worker.go:152] Polling the log file for events...
W0320 11:47:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:47:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 11:47:14.455157 543705 disk_worker.go:728] disk inode is not compliant
E0320 11:47:14.456905 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:47:14.456915 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:47:14.456920 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:47:14.456990 543705 disk_worker.go:494] system disk:vda1
I0320 11:47:14.457031 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:47:15.456868 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:47:15.456876 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:47:16.457931 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:47:16.457930 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:47:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:47:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:47:16.472350 543705 disk_local_worker.go:436] Get disk info: []
I0320 11:47:23.409811 543705 cpu.go:275] no items to output this cycle
E0320 11:47:23.409812 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:23.409832 543705 memory.go:184] no items to output this cycle
I0320 11:47:24.149683 543705 disk_info.go:125] begin check local disk info of client
I0320 11:47:24.152247 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:47:24.152255 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4940 0xc0000c4980]
E0320 11:47:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:33.409780 543705 memory.go:184] no items to output this cycle
I0320 11:47:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 11:47:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:43.409800 543705 memory.go:191] Add success.
I0320 11:47:43.409800 543705 cpu.go:282] Add success.
I0320 11:47:43.419850 543705 net.go:648] Add success.
I0320 11:47:43.422715 543705 net.go:770] primary dev: ETH0
I0320 11:47:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:47:43.422743 543705 net.go:698] Add success.
I0320 11:47:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:47:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:47:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:47:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:47:53.409793 543705 memory.go:184] no items to output this cycle
I0320 11:47:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:48:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:03.409904 543705 memory.go:184] no items to output this cycle
I0320 11:48:03.409915 543705 cpu.go:275] no items to output this cycle
E0320 11:48:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:13.409796 543705 memory.go:191] Add success.
I0320 11:48:13.409797 543705 cpu.go:282] Add success.
W0320 11:48:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:48:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:48:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:48:13.420231 543705 net.go:648] Add success.
I0320 11:48:13.422677 543705 net.go:770] primary dev: ETH0
I0320 11:48:13.422691 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:48:13.422706 543705 net.go:698] Add success.
I0320 11:48:13.464356 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f10a341f-ce05-4a3c-81f5-cbc165cf0f3a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:48:13.464389 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:48:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:48:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:48:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 11:48:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:48:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 11:48:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:48:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:48:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:48:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:48:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:48:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:48:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:23.409801 543705 memory.go:184] no items to output this cycle
I0320 11:48:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 11:48:24.153686 543705 disk_info.go:125] begin check local disk info of client
I0320 11:48:24.156285 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:48:24.156293 543705 disk_info.go:196] parse disk info done, disk is : [0xc000284b80 0xc000284bc0]
E0320 11:48:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:33.409801 543705 memory.go:184] no items to output this cycle
I0320 11:48:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 11:48:38.457035 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:48:38.457042 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:48:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:43.410549 543705 memory.go:191] Add success.
I0320 11:48:43.409826 543705 cpu.go:282] Add success.
I0320 11:48:43.420240 543705 net.go:648] Add success.
I0320 11:48:43.422962 543705 net.go:770] primary dev: ETH0
I0320 11:48:43.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:48:43.422991 543705 net.go:698] Add success.
I0320 11:48:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:48:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:48:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:48:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:48:53.409799 543705 memory.go:184] no items to output this cycle
I0320 11:48:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 11:49:03.409895 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:03.409914 543705 memory.go:184] no items to output this cycle
I0320 11:49:03.409987 543705 cpu.go:275] no items to output this cycle
E0320 11:49:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:13.409819 543705 memory.go:191] Add success.
I0320 11:49:13.409828 543705 cpu.go:282] Add success.
W0320 11:49:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:49:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:49:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:49:13.420138 543705 net.go:648] Add success.
I0320 11:49:13.422783 543705 net.go:770] primary dev: ETH0
I0320 11:49:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:49:13.422808 543705 net.go:698] Add success.
I0320 11:49:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:49:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:49:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 11:49:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:49:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 11:49:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:49:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:49:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:49:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:49:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:49:16.472373 543705 disk_local_worker.go:436] Get disk info: []
I0320 11:49:23.409785 543705 cpu.go:275] no items to output this cycle
E0320 11:49:23.409807 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:23.409825 543705 memory.go:184] no items to output this cycle
I0320 11:49:24.157823 543705 disk_info.go:125] begin check local disk info of client
I0320 11:49:24.160353 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:49:24.160361 543705 disk_info.go:196] parse disk info done, disk is : [0xc000367000 0xc000367040]
E0320 11:49:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:33.409794 543705 cpu.go:275] no items to output this cycle
I0320 11:49:33.409796 543705 memory.go:184] no items to output this cycle
E0320 11:49:43.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:43.409839 543705 memory.go:191] Add success.
I0320 11:49:43.409844 543705 cpu.go:282] Add success.
I0320 11:49:43.420081 543705 net.go:648] Add success.
I0320 11:49:43.423365 543705 net.go:770] primary dev: ETH0
I0320 11:49:43.423379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:49:43.423392 543705 net.go:698] Add success.
I0320 11:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:49:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:49:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:49:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:49:53.409779 543705 memory.go:184] no items to output this cycle
I0320 11:49:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 11:50:03.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:03.409881 543705 memory.go:184] no items to output this cycle
I0320 11:50:03.409961 543705 cpu.go:275] no items to output this cycle
E0320 11:50:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:13.409792 543705 memory.go:191] Add success.
W0320 11:50:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:50:13.409818 543705 cpu.go:282] Add success.
W0320 11:50:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:50:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:50:13.420171 543705 net.go:648] Add success.
I0320 11:50:13.422750 543705 net.go:770] primary dev: ETH0
I0320 11:50:13.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:50:13.422788 543705 net.go:698] Add success.
I0320 11:50:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:50:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:50:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 11:50:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:50:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 11:50:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:50:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:50:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:50:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:50:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:50:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:50:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:23.409779 543705 memory.go:184] no items to output this cycle
I0320 11:50:23.409841 543705 cpu.go:275] no items to output this cycle
I0320 11:50:24.161684 543705 disk_info.go:125] begin check local disk info of client
I0320 11:50:24.164830 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:50:24.164837 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4bc0 0xc0000c4c00]
E0320 11:50:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:33.409821 543705 memory.go:184] no items to output this cycle
I0320 11:50:33.409832 543705 cpu.go:275] no items to output this cycle
E0320 11:50:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:43.409835 543705 memory.go:191] Add success.
I0320 11:50:43.409844 543705 cpu.go:282] Add success.
I0320 11:50:43.419898 543705 net.go:648] Add success.
I0320 11:50:43.422947 543705 net.go:770] primary dev: ETH0
I0320 11:50:43.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:50:43.422988 543705 net.go:698] Add success.
I0320 11:50:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:50:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:50:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:50:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:50:53.409769 543705 memory.go:184] no items to output this cycle
I0320 11:50:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 11:51:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:03.409775 543705 memory.go:184] no items to output this cycle
I0320 11:51:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 11:51:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:13.409795 543705 memory.go:191] Add success.
I0320 11:51:13.409809 543705 cpu.go:282] Add success.
W0320 11:51:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:51:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:51:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:51:13.420140 543705 net.go:648] Add success.
I0320 11:51:13.423293 543705 net.go:770] primary dev: ETH0
I0320 11:51:13.423307 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:51:13.423320 543705 net.go:698] Add success.
I0320 11:51:13.469775 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"100b57d8-1617-49a9-881d-e14099a3da11","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:51:13.469808 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:51:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:51:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:51:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 11:51:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:51:14.456518 543705 disk_worker.go:494] system disk:vda1
I0320 11:51:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:51:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:51:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:51:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:51:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:51:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:51:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:23.409802 543705 memory.go:184] no items to output this cycle
I0320 11:51:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 11:51:24.165682 543705 disk_info.go:125] begin check local disk info of client
I0320 11:51:24.168238 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:51:24.168245 543705 disk_info.go:196] parse disk info done, disk is : [0xc000542780 0xc0005427c0]
E0320 11:51:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:33.409783 543705 memory.go:184] no items to output this cycle
I0320 11:51:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 11:51:38.458040 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:51:38.458047 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:51:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:43.410774 543705 memory.go:191] Add success.
I0320 11:51:43.409811 543705 cpu.go:282] Add success.
I0320 11:51:43.420471 543705 net.go:648] Add success.
I0320 11:51:43.423568 543705 net.go:770] primary dev: ETH0
I0320 11:51:43.423582 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:51:43.423596 543705 net.go:698] Add success.
I0320 11:51:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:51:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:51:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:51:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:51:53.409799 543705 memory.go:184] no items to output this cycle
I0320 11:51:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 11:52:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:03.409909 543705 memory.go:184] no items to output this cycle
I0320 11:52:03.409932 543705 cpu.go:275] no items to output this cycle
E0320 11:52:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:13.409775 543705 memory.go:191] Add success.
W0320 11:52:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:52:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:52:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:52:13.409824 543705 cpu.go:282] Add success.
I0320 11:52:13.420153 543705 net.go:648] Add success.
I0320 11:52:13.423354 543705 net.go:770] primary dev: ETH0
I0320 11:52:13.423374 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:52:13.423388 543705 net.go:698] Add success.
W0320 11:52:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:52:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 11:52:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:52:14.456793 543705 disk_worker.go:494] system disk:vda1
I0320 11:52:14.456832 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:52:14.457118 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:52:14.457126 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:52:14.457130 543705 custom_config.go:64] query custom config with name: gpu
E0320 11:52:15.456773 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:52:15.456781 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:52:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:52:16.457977 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:52:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:52:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:52:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:52:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:23.409782 543705 memory.go:184] no items to output this cycle
I0320 11:52:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 11:52:24.169680 543705 disk_info.go:125] begin check local disk info of client
I0320 11:52:24.172281 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:52:24.172289 543705 disk_info.go:196] parse disk info done, disk is : [0xc000473380 0xc0004733c0]
E0320 11:52:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:33.409813 543705 memory.go:184] no items to output this cycle
I0320 11:52:33.409828 543705 cpu.go:275] no items to output this cycle
E0320 11:52:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:43.409784 543705 memory.go:191] Add success.
I0320 11:52:43.409805 543705 cpu.go:282] Add success.
I0320 11:52:43.420084 543705 net.go:648] Add success.
I0320 11:52:43.422813 543705 net.go:770] primary dev: ETH0
I0320 11:52:43.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:52:43.422840 543705 net.go:698] Add success.
I0320 11:52:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:52:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:52:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:52:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:52:53.409778 543705 memory.go:184] no items to output this cycle
I0320 11:52:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 11:53:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:03.409843 543705 memory.go:184] no items to output this cycle
I0320 11:53:03.409906 543705 cpu.go:275] no items to output this cycle
E0320 11:53:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:13.409822 543705 memory.go:191] Add success.
I0320 11:53:13.409830 543705 cpu.go:282] Add success.
W0320 11:53:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:53:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:53:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:53:13.420156 543705 net.go:648] Add success.
I0320 11:53:13.423368 543705 net.go:770] primary dev: ETH0
I0320 11:53:13.423383 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:53:13.423397 543705 net.go:698] Add success.
I0320 11:53:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:53:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:53:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 11:53:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:53:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 11:53:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:53:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:53:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:53:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:53:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:53:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:53:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 11:53:24.173679 543705 disk_info.go:125] begin check local disk info of client
I0320 11:53:24.176273 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:53:24.176280 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002857c0 0xc000285800]
E0320 11:53:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:33.409801 543705 memory.go:184] no items to output this cycle
I0320 11:53:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 11:53:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:43.409786 543705 memory.go:191] Add success.
I0320 11:53:43.409790 543705 cpu.go:282] Add success.
I0320 11:53:43.419978 543705 net.go:648] Add success.
I0320 11:53:43.420875 543705 net.go:770] primary dev: ETH0
I0320 11:53:43.420888 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:53:43.420902 543705 net.go:698] Add success.
I0320 11:53:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:53:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:53:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:53:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:53:53.409770 543705 memory.go:184] no items to output this cycle
I0320 11:53:53.409791 543705 cpu.go:275] no items to output this cycle
I0320 11:54:03.409895 543705 cpu.go:275] no items to output this cycle
E0320 11:54:03.409929 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:03.410044 543705 memory.go:184] no items to output this cycle
E0320 11:54:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:13.409777 543705 memory.go:191] Add success.
W0320 11:54:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 11:54:13.409807 543705 cpu.go:282] Add success.
W0320 11:54:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:54:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:54:13.420194 543705 net.go:648] Add success.
I0320 11:54:13.422970 543705 net.go:770] primary dev: ETH0
I0320 11:54:13.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:54:13.423010 543705 net.go:698] Add success.
I0320 11:54:13.470515 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d08200df-97dd-4337-8d3c-150fb19376b5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:54:13.470548 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 11:54:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:54:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:54:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 11:54:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:54:14.456695 543705 disk_worker.go:494] system disk:vda1
I0320 11:54:14.456723 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:54:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:54:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:54:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:54:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:54:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:23.409802 543705 memory.go:184] no items to output this cycle
I0320 11:54:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 11:54:24.177681 543705 disk_info.go:125] begin check local disk info of client
I0320 11:54:24.180247 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:54:24.180254 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035cb40 0xc00035cb80]
E0320 11:54:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:33.409785 543705 memory.go:184] no items to output this cycle
I0320 11:54:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 11:54:38.459048 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:54:38.459055 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:54:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:43.410858 543705 memory.go:191] Add success.
I0320 11:54:43.409793 543705 cpu.go:282] Add success.
I0320 11:54:43.420610 543705 net.go:648] Add success.
I0320 11:54:43.423493 543705 net.go:770] primary dev: ETH0
I0320 11:54:43.423506 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:54:43.423521 543705 net.go:698] Add success.
I0320 11:54:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:54:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:54:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:54:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:54:53.409781 543705 memory.go:184] no items to output this cycle
I0320 11:54:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 11:55:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:03.409796 543705 memory.go:184] no items to output this cycle
I0320 11:55:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 11:55:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:13.409789 543705 memory.go:191] Add success.
I0320 11:55:13.409810 543705 cpu.go:282] Add success.
W0320 11:55:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:55:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:55:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:55:13.420246 543705 net.go:648] Add success.
I0320 11:55:13.423088 543705 net.go:770] primary dev: ETH0
I0320 11:55:13.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:55:13.423111 543705 net.go:698] Add success.
I0320 11:55:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:55:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:55:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 11:55:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:55:14.456565 543705 disk_worker.go:494] system disk:vda1
I0320 11:55:14.456594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:55:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:55:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:55:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:55:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:55:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:55:23.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:23.409757 543705 memory.go:184] no items to output this cycle
I0320 11:55:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 11:55:24.181688 543705 disk_info.go:125] begin check local disk info of client
I0320 11:55:24.184251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:55:24.184258 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faa80 0xc0001faac0]
E0320 11:55:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:33.409800 543705 memory.go:184] no items to output this cycle
I0320 11:55:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 11:55:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:43.409798 543705 memory.go:191] Add success.
I0320 11:55:43.409802 543705 cpu.go:282] Add success.
I0320 11:55:43.419887 543705 net.go:648] Add success.
I0320 11:55:43.423146 543705 net.go:770] primary dev: ETH0
I0320 11:55:43.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:55:43.423172 543705 net.go:698] Add success.
I0320 11:55:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:55:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:55:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:55:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:55:53.409771 543705 memory.go:184] no items to output this cycle
I0320 11:55:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 11:56:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:03.409801 543705 memory.go:184] no items to output this cycle
I0320 11:56:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 11:56:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:13.409813 543705 memory.go:191] Add success.
I0320 11:56:13.409822 543705 cpu.go:282] Add success.
W0320 11:56:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:56:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:56:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:56:13.420054 543705 net.go:648] Add success.
I0320 11:56:13.422624 543705 net.go:770] primary dev: ETH0
I0320 11:56:13.422638 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:56:13.422652 543705 net.go:698] Add success.
I0320 11:56:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:56:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:56:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 11:56:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:56:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 11:56:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:56:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:56:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:56:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:56:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:56:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:56:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:23.409795 543705 memory.go:184] no items to output this cycle
I0320 11:56:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 11:56:24.185684 543705 disk_info.go:125] begin check local disk info of client
I0320 11:56:24.188223 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:56:24.188231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bed00 0xc0002bed40]
E0320 11:56:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:33.409815 543705 memory.go:184] no items to output this cycle
I0320 11:56:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 11:56:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:43.409787 543705 memory.go:191] Add success.
I0320 11:56:43.409809 543705 cpu.go:282] Add success.
I0320 11:56:43.420011 543705 net.go:648] Add success.
I0320 11:56:43.423560 543705 net.go:770] primary dev: ETH0
I0320 11:56:43.423575 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:56:43.423590 543705 net.go:698] Add success.
I0320 11:56:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:56:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:56:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:56:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:56:53.409774 543705 memory.go:184] no items to output this cycle
I0320 11:56:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:57:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:03.409767 543705 memory.go:184] no items to output this cycle
I0320 11:57:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 11:57:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:13.409783 543705 memory.go:191] Add success.
I0320 11:57:13.409796 543705 cpu.go:282] Add success.
W0320 11:57:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:57:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:57:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:57:13.420051 543705 net.go:648] Add success.
I0320 11:57:13.428490 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 11:57:13.428566 543705 net.go:770] primary dev: ETH0
I0320 11:57:13.428580 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:57:13.428594 543705 net.go:698] Add success.
I0320 11:57:13.453139 543705 event_worker.go:152] Polling the log file for events...
I0320 11:57:13.469054 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"03ca1274-9241-40bd-a20d-9f0286615121","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 11:57:13.469087 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 11:57:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:57:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 11:57:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:57:14.456846 543705 disk_worker.go:494] system disk:vda1
E0320 11:57:14.456869 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 11:57:14.456877 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 11:57:14.456882 543705 custom_config.go:64] query custom config with name: gpu
I0320 11:57:14.456901 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 11:57:15.456821 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 11:57:15.456829 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:57:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 11:57:16.457983 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 11:57:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:57:16.458042 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:57:16.472357 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:57:23.410367 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:23.410383 543705 memory.go:184] no items to output this cycle
I0320 11:57:23.410393 543705 cpu.go:275] no items to output this cycle
I0320 11:57:24.189694 543705 disk_info.go:125] begin check local disk info of client
I0320 11:57:24.192259 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:57:24.192266 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033e2c0 0xc00033e300]
E0320 11:57:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:33.409778 543705 memory.go:184] no items to output this cycle
I0320 11:57:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 11:57:38.460057 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 11:57:38.460064 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 11:57:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:43.410519 543705 memory.go:191] Add success.
I0320 11:57:43.409799 543705 cpu.go:282] Add success.
I0320 11:57:43.420214 543705 net.go:648] Add success.
I0320 11:57:43.423162 543705 net.go:770] primary dev: ETH0
I0320 11:57:43.423177 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:57:43.423190 543705 net.go:698] Add success.
I0320 11:57:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:57:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:57:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:57:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:57:53.409777 543705 memory.go:184] no items to output this cycle
I0320 11:57:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 11:58:03.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:03.409899 543705 memory.go:184] no items to output this cycle
I0320 11:58:03.409898 543705 cpu.go:275] no items to output this cycle
E0320 11:58:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:13.409814 543705 memory.go:191] Add success.
I0320 11:58:13.409826 543705 cpu.go:282] Add success.
W0320 11:58:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:58:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:58:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:58:13.420104 543705 net.go:648] Add success.
I0320 11:58:13.422508 543705 net.go:770] primary dev: ETH0
I0320 11:58:13.422523 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:58:13.422537 543705 net.go:698] Add success.
I0320 11:58:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:58:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:58:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 11:58:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:58:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 11:58:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:58:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:58:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:58:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:58:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:58:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:23.409770 543705 memory.go:184] no items to output this cycle
I0320 11:58:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 11:58:24.193696 543705 disk_info.go:125] begin check local disk info of client
I0320 11:58:24.196187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:58:24.196195 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a880 0xc00048a8c0]
E0320 11:58:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:33.409807 543705 memory.go:184] no items to output this cycle
I0320 11:58:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 11:58:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:43.409824 543705 memory.go:191] Add success.
I0320 11:58:43.409829 543705 cpu.go:282] Add success.
I0320 11:58:43.419982 543705 net.go:648] Add success.
I0320 11:58:43.422538 543705 net.go:770] primary dev: ETH0
I0320 11:58:43.422551 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:58:43.422564 543705 net.go:698] Add success.
I0320 11:58:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:58:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:58:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:58:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:58:53.409769 543705 memory.go:184] no items to output this cycle
I0320 11:58:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 11:59:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:03.409814 543705 memory.go:184] no items to output this cycle
I0320 11:59:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 11:59:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:13.409786 543705 memory.go:191] Add success.
I0320 11:59:13.409800 543705 cpu.go:282] Add success.
W0320 11:59:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 11:59:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 11:59:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 11:59:13.420060 543705 net.go:648] Add success.
I0320 11:59:13.422866 543705 net.go:770] primary dev: ETH0
I0320 11:59:13.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:59:13.422891 543705 net.go:698] Add success.
I0320 11:59:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 11:59:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 11:59:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 11:59:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 11:59:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 11:59:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 11:59:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 11:59:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:59:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:59:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 11:59:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 11:59:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:23.409803 543705 memory.go:184] no items to output this cycle
I0320 11:59:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 11:59:24.197686 543705 disk_info.go:125] begin check local disk info of client
I0320 11:59:24.200079 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 11:59:24.200087 543705 disk_info.go:196] parse disk info done, disk is : [0xc000294900 0xc000294940]
E0320 11:59:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:33.409773 543705 memory.go:184] no items to output this cycle
I0320 11:59:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 11:59:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:43.409801 543705 memory.go:191] Add success.
I0320 11:59:43.409802 543705 cpu.go:282] Add success.
I0320 11:59:43.420077 543705 net.go:648] Add success.
I0320 11:59:43.422993 543705 net.go:770] primary dev: ETH0
I0320 11:59:43.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0320 11:59:43.423019 543705 net.go:698] Add success.
I0320 11:59:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 11:59:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 11:59:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 11:59:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 11:59:53.409775 543705 memory.go:184] no items to output this cycle
I0320 11:59:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 12:00:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:03.409793 543705 memory.go:184] no items to output this cycle
I0320 12:00:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 12:00:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:13.409797 543705 memory.go:191] Add success.
I0320 12:00:13.409800 543705 cpu.go:282] Add success.
W0320 12:00:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:00:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:00:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:00:13.420116 543705 net.go:648] Add success.
I0320 12:00:13.422802 543705 net.go:770] primary dev: ETH0
I0320 12:00:13.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:00:13.422827 543705 net.go:698] Add success.
I0320 12:00:13.464062 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"627c75f3-53c1-4521-808f-0f68117d4818","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:00:13.464093 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:00:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:00:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:00:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 12:00:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:00:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 12:00:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:00:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:00:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:00:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:00:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:00:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:00:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:23.409762 543705 memory.go:184] no items to output this cycle
I0320 12:00:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 12:00:24.201694 543705 disk_info.go:125] begin check local disk info of client
I0320 12:00:24.204112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:00:24.204120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa440 0xc0001fa480]
E0320 12:00:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:33.409773 543705 memory.go:184] no items to output this cycle
I0320 12:00:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 12:00:38.461055 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:00:38.461062 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:00:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:43.411059 543705 memory.go:191] Add success.
I0320 12:00:43.409813 543705 cpu.go:282] Add success.
I0320 12:00:43.419728 543705 net.go:648] Add success.
I0320 12:00:43.422270 543705 net.go:770] primary dev: ETH0
I0320 12:00:43.422285 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:00:43.422300 543705 net.go:698] Add success.
I0320 12:00:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:00:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:00:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:00:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:00:53.409774 543705 memory.go:184] no items to output this cycle
I0320 12:00:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 12:01:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:03.409770 543705 memory.go:184] no items to output this cycle
I0320 12:01:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 12:01:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:13.409789 543705 memory.go:191] Add success.
I0320 12:01:13.409808 543705 cpu.go:282] Add success.
W0320 12:01:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:01:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:01:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:01:13.420217 543705 net.go:648] Add success.
I0320 12:01:13.422924 543705 net.go:770] primary dev: ETH0
I0320 12:01:13.422937 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:01:13.422949 543705 net.go:698] Add success.
I0320 12:01:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:01:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:01:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0320 12:01:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:01:14.456493 543705 disk_worker.go:494] system disk:vda1
I0320 12:01:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:01:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:01:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:01:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:01:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:01:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:01:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:23.409767 543705 memory.go:184] no items to output this cycle
I0320 12:01:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 12:01:24.205677 543705 disk_info.go:125] begin check local disk info of client
I0320 12:01:24.208129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:01:24.208137 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb3c0 0xc0001fb400]
E0320 12:01:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:33.409778 543705 memory.go:184] no items to output this cycle
I0320 12:01:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 12:01:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:43.409802 543705 memory.go:191] Add success.
I0320 12:01:43.409820 543705 cpu.go:282] Add success.
I0320 12:01:43.419912 543705 net.go:648] Add success.
I0320 12:01:43.422660 543705 net.go:770] primary dev: ETH0
I0320 12:01:43.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:01:43.422688 543705 net.go:698] Add success.
I0320 12:01:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:01:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:01:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:01:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:01:53.409804 543705 memory.go:184] no items to output this cycle
I0320 12:01:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 12:02:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:03.409786 543705 memory.go:184] no items to output this cycle
I0320 12:02:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 12:02:13.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:13.409838 543705 memory.go:191] Add success.
I0320 12:02:13.409845 543705 cpu.go:282] Add success.
W0320 12:02:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:02:13.409885 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:02:13.409889 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:02:13.420200 543705 net.go:648] Add success.
I0320 12:02:13.423212 543705 net.go:770] primary dev: ETH0
I0320 12:02:13.423227 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:02:13.423241 543705 net.go:698] Add success.
W0320 12:02:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:02:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 12:02:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:02:14.455929 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:02:14.455938 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:02:14.455944 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:02:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 12:02:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:02:15.456893 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:02:15.456903 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:02:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:02:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:02:16.457988 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:02:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:02:16.472349 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:02:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:23.409784 543705 memory.go:184] no items to output this cycle
I0320 12:02:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 12:02:24.209684 543705 disk_info.go:125] begin check local disk info of client
I0320 12:02:24.212147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:02:24.212155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f47c0 0xc0004f4800]
E0320 12:02:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:33.409781 543705 memory.go:184] no items to output this cycle
I0320 12:02:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 12:02:43.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:43.409843 543705 memory.go:191] Add success.
I0320 12:02:43.409813 543705 cpu.go:282] Add success.
I0320 12:02:43.420104 543705 net.go:648] Add success.
I0320 12:02:43.421062 543705 net.go:770] primary dev: ETH0
I0320 12:02:43.421074 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:02:43.421092 543705 net.go:698] Add success.
I0320 12:02:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:02:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:02:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:02:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:02:53.409776 543705 memory.go:184] no items to output this cycle
I0320 12:02:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 12:03:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:03.409805 543705 memory.go:184] no items to output this cycle
I0320 12:03:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 12:03:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:13.409791 543705 memory.go:191] Add success.
I0320 12:03:13.409813 543705 cpu.go:282] Add success.
W0320 12:03:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:03:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:03:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:03:13.420246 543705 net.go:648] Add success.
I0320 12:03:13.423209 543705 net.go:770] primary dev: ETH0
I0320 12:03:13.423223 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:03:13.423234 543705 net.go:698] Add success.
I0320 12:03:13.468239 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0f7cc2ab-d590-4f29-b139-9a0b246966c6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:03:13.468271 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:03:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:03:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:03:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 12:03:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:03:14.456965 543705 disk_worker.go:494] system disk:vda1
I0320 12:03:14.457002 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:03:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:03:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:03:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:03:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:03:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:03:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:23.409779 543705 memory.go:184] no items to output this cycle
I0320 12:03:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 12:03:24.213674 543705 disk_info.go:125] begin check local disk info of client
I0320 12:03:24.216041 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:03:24.216048 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004704c0 0xc000470500]
E0320 12:03:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:33.409775 543705 memory.go:184] no items to output this cycle
I0320 12:03:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 12:03:38.462068 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:03:38.462075 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:03:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:43.410683 543705 memory.go:191] Add success.
I0320 12:03:43.409832 543705 cpu.go:282] Add success.
I0320 12:03:43.420401 543705 net.go:648] Add success.
I0320 12:03:43.423264 543705 net.go:770] primary dev: ETH0
I0320 12:03:43.423278 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:03:43.423293 543705 net.go:698] Add success.
I0320 12:03:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:03:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:03:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:03:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:03:53.409794 543705 memory.go:184] no items to output this cycle
I0320 12:03:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 12:04:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:03.409776 543705 memory.go:184] no items to output this cycle
I0320 12:04:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 12:04:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:13.409783 543705 memory.go:191] Add success.
I0320 12:04:13.409787 543705 cpu.go:282] Add success.
W0320 12:04:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:04:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:04:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:04:13.420059 543705 net.go:648] Add success.
I0320 12:04:13.422871 543705 net.go:770] primary dev: ETH0
I0320 12:04:13.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:04:13.422897 543705 net.go:698] Add success.
I0320 12:04:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:04:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:04:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 12:04:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:04:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 12:04:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:04:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:04:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:04:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:04:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:04:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:23.409775 543705 memory.go:184] no items to output this cycle
I0320 12:04:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 12:04:24.216132 543705 disk_info.go:125] begin check local disk info of client
I0320 12:04:24.218676 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:04:24.218685 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003824c0 0xc000382500]
E0320 12:04:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:33.409815 543705 memory.go:184] no items to output this cycle
I0320 12:04:33.409831 543705 cpu.go:275] no items to output this cycle
E0320 12:04:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:43.409787 543705 memory.go:191] Add success.
I0320 12:04:43.409813 543705 cpu.go:282] Add success.
I0320 12:04:43.420000 543705 net.go:648] Add success.
I0320 12:04:43.423038 543705 net.go:770] primary dev: ETH0
I0320 12:04:43.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:04:43.423063 543705 net.go:698] Add success.
I0320 12:04:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:04:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:04:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:04:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:04:53.409775 543705 memory.go:184] no items to output this cycle
I0320 12:04:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 12:05:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:03.409765 543705 memory.go:184] no items to output this cycle
I0320 12:05:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 12:05:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:13.409808 543705 memory.go:191] Add success.
I0320 12:05:13.409815 543705 cpu.go:282] Add success.
W0320 12:05:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:05:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:05:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:05:13.420108 543705 net.go:648] Add success.
I0320 12:05:13.422655 543705 net.go:770] primary dev: ETH0
I0320 12:05:13.422669 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:05:13.422683 543705 net.go:698] Add success.
I0320 12:05:14.454987 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:05:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:05:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 12:05:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:05:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 12:05:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:05:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:05:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:05:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:05:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:05:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:05:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:23.409772 543705 memory.go:184] no items to output this cycle
I0320 12:05:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 12:05:24.218781 543705 disk_info.go:125] begin check local disk info of client
I0320 12:05:24.221297 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:05:24.221303 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a6300 0xc0002a6340]
E0320 12:05:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:33.409811 543705 memory.go:184] no items to output this cycle
I0320 12:05:33.409833 543705 cpu.go:275] no items to output this cycle
E0320 12:05:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:43.409810 543705 memory.go:191] Add success.
I0320 12:05:43.409811 543705 cpu.go:282] Add success.
I0320 12:05:43.420057 543705 net.go:648] Add success.
I0320 12:05:43.422871 543705 net.go:770] primary dev: ETH0
I0320 12:05:43.422884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:05:43.422896 543705 net.go:698] Add success.
I0320 12:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:05:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:05:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:05:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:05:53.409775 543705 memory.go:184] no items to output this cycle
I0320 12:05:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 12:06:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:03.409792 543705 memory.go:184] no items to output this cycle
I0320 12:06:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 12:06:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:13.409801 543705 cpu.go:282] Add success.
I0320 12:06:13.409806 543705 memory.go:191] Add success.
W0320 12:06:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:06:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:06:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:06:13.420082 543705 net.go:648] Add success.
I0320 12:06:13.422856 543705 net.go:770] primary dev: ETH0
I0320 12:06:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:06:13.422882 543705 net.go:698] Add success.
I0320 12:06:13.468492 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7c0c633d-623b-4ec5-8ef6-ad52256bc1d8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:06:13.468524 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:06:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:06:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:06:14.455244 543705 disk_worker.go:708] disk space is not compliant
W0320 12:06:14.455247 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:06:14.456771 543705 disk_worker.go:494] system disk:vda1
I0320 12:06:14.456802 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:06:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:06:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:06:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:06:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:06:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:06:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:23.409771 543705 memory.go:184] no items to output this cycle
I0320 12:06:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 12:06:24.221671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:06:24.224217 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:06:24.224223 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bc40 0xc00007bc80]
E0320 12:06:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:33.409774 543705 memory.go:184] no items to output this cycle
I0320 12:06:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 12:06:38.463084 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:06:38.463091 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:06:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:43.410675 543705 memory.go:191] Add success.
I0320 12:06:43.409796 543705 cpu.go:282] Add success.
I0320 12:06:43.420407 543705 net.go:648] Add success.
I0320 12:06:43.423649 543705 net.go:770] primary dev: ETH0
I0320 12:06:43.423662 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:06:43.423675 543705 net.go:698] Add success.
I0320 12:06:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:06:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:06:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:06:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:06:53.409770 543705 memory.go:184] no items to output this cycle
I0320 12:06:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 12:07:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:03.409798 543705 memory.go:184] no items to output this cycle
I0320 12:07:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 12:07:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:13.409775 543705 memory.go:191] Add success.
W0320 12:07:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:07:13.409807 543705 cpu.go:282] Add success.
W0320 12:07:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:07:13.409814 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:07:13.420041 543705 net.go:648] Add success.
I0320 12:07:13.422886 543705 net.go:770] primary dev: ETH0
I0320 12:07:13.422900 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:07:13.422912 543705 net.go:698] Add success.
I0320 12:07:13.453531 543705 event_worker.go:152] Polling the log file for events...
W0320 12:07:14.455329 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:07:14.455425 543705 disk_worker.go:708] disk space is not compliant
W0320 12:07:14.455430 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:07:14.456337 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:07:14.456346 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:07:14.456352 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:07:14.457234 543705 disk_worker.go:494] system disk:vda1
I0320 12:07:14.457267 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:07:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:07:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:07:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:07:16.457982 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:07:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:07:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:07:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:07:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:23.409775 543705 memory.go:184] no items to output this cycle
I0320 12:07:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 12:07:24.225677 543705 disk_info.go:125] begin check local disk info of client
I0320 12:07:24.228165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:07:24.228172 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ba80 0xc00007bac0]
E0320 12:07:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:33.409807 543705 memory.go:184] no items to output this cycle
I0320 12:07:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:07:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:43.409792 543705 memory.go:191] Add success.
I0320 12:07:43.409812 543705 cpu.go:282] Add success.
I0320 12:07:43.420062 543705 net.go:648] Add success.
I0320 12:07:43.422889 543705 net.go:770] primary dev: ETH0
I0320 12:07:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:07:43.422916 543705 net.go:698] Add success.
I0320 12:07:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:07:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:07:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:07:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:07:53.409771 543705 memory.go:184] no items to output this cycle
I0320 12:07:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 12:08:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:03.409795 543705 memory.go:184] no items to output this cycle
I0320 12:08:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 12:08:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:13.409781 543705 memory.go:191] Add success.
W0320 12:08:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:08:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:08:13.409820 543705 cpu.go:282] Add success.
I0320 12:08:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:08:13.420224 543705 net.go:648] Add success.
I0320 12:08:13.422987 543705 net.go:770] primary dev: ETH0
I0320 12:08:13.423001 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:08:13.423014 543705 net.go:698] Add success.
I0320 12:08:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:08:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:08:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 12:08:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:08:14.456830 543705 disk_worker.go:494] system disk:vda1
I0320 12:08:14.456859 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:08:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:08:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:08:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:08:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:08:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:08:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:23.409809 543705 memory.go:184] no items to output this cycle
I0320 12:08:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 12:08:24.229673 543705 disk_info.go:125] begin check local disk info of client
I0320 12:08:24.232171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:08:24.232177 543705 disk_info.go:196] parse disk info done, disk is : [0xc000460d00 0xc000460d40]
E0320 12:08:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:33.409776 543705 memory.go:184] no items to output this cycle
I0320 12:08:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 12:08:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:43.409792 543705 memory.go:191] Add success.
I0320 12:08:43.409809 543705 cpu.go:282] Add success.
I0320 12:08:43.419988 543705 net.go:648] Add success.
I0320 12:08:43.422551 543705 net.go:770] primary dev: ETH0
I0320 12:08:43.422565 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:08:43.422577 543705 net.go:698] Add success.
I0320 12:08:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:08:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:08:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:08:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:08:53.409784 543705 memory.go:184] no items to output this cycle
I0320 12:08:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 12:09:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:03.409780 543705 memory.go:184] no items to output this cycle
I0320 12:09:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 12:09:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:13.409791 543705 memory.go:191] Add success.
I0320 12:09:13.409791 543705 cpu.go:282] Add success.
W0320 12:09:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:09:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:09:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:09:13.420215 543705 net.go:648] Add success.
I0320 12:09:13.423148 543705 net.go:770] primary dev: ETH0
I0320 12:09:13.423161 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:09:13.423175 543705 net.go:698] Add success.
I0320 12:09:13.465153 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7bab7df0-ef38-4e68-847e-e8897be76f47","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:09:13.465205 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:09:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:09:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:09:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 12:09:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:09:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 12:09:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:09:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:09:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:09:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:09:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:09:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:09:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:23.409787 543705 memory.go:184] no items to output this cycle
I0320 12:09:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 12:09:24.233672 543705 disk_info.go:125] begin check local disk info of client
I0320 12:09:24.236133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:09:24.236138 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376c40 0xc000376c80]
E0320 12:09:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:33.409775 543705 memory.go:184] no items to output this cycle
I0320 12:09:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 12:09:38.464071 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:09:38.464077 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:09:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:43.411065 543705 memory.go:191] Add success.
I0320 12:09:43.409825 543705 cpu.go:282] Add success.
I0320 12:09:43.419711 543705 net.go:648] Add success.
I0320 12:09:43.422846 543705 net.go:770] primary dev: ETH0
I0320 12:09:43.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:09:43.422872 543705 net.go:698] Add success.
I0320 12:09:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:09:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:09:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:09:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:09:53.409798 543705 memory.go:184] no items to output this cycle
I0320 12:09:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 12:10:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:03.409779 543705 memory.go:184] no items to output this cycle
I0320 12:10:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 12:10:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:13.409796 543705 memory.go:191] Add success.
I0320 12:10:13.409804 543705 cpu.go:282] Add success.
W0320 12:10:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:10:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:10:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:10:13.420108 543705 net.go:648] Add success.
I0320 12:10:13.422611 543705 net.go:770] primary dev: ETH0
I0320 12:10:13.422624 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:10:13.422635 543705 net.go:698] Add success.
I0320 12:10:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:10:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:10:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 12:10:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:10:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 12:10:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:10:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:10:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:10:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:10:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:10:16.472516 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:10:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:23.409813 543705 memory.go:184] no items to output this cycle
I0320 12:10:23.409844 543705 cpu.go:275] no items to output this cycle
I0320 12:10:24.237669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:10:24.240187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:10:24.240193 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa3c0 0xc0001aa400]
E0320 12:10:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:33.409809 543705 memory.go:184] no items to output this cycle
I0320 12:10:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 12:10:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:43.409776 543705 memory.go:191] Add success.
I0320 12:10:43.409812 543705 cpu.go:282] Add success.
I0320 12:10:43.419869 543705 net.go:648] Add success.
I0320 12:10:43.422571 543705 net.go:770] primary dev: ETH0
I0320 12:10:43.422584 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:10:43.422597 543705 net.go:698] Add success.
I0320 12:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:10:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:10:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:10:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:10:53.409769 543705 memory.go:184] no items to output this cycle
I0320 12:10:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 12:11:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:03.409781 543705 memory.go:184] no items to output this cycle
I0320 12:11:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 12:11:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:13.409796 543705 memory.go:191] Add success.
I0320 12:11:13.409799 543705 cpu.go:282] Add success.
W0320 12:11:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:11:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:11:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:11:13.420103 543705 net.go:648] Add success.
I0320 12:11:13.423003 543705 net.go:770] primary dev: ETH0
I0320 12:11:13.423015 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:11:13.423027 543705 net.go:698] Add success.
I0320 12:11:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:11:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:11:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 12:11:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:11:14.456631 543705 disk_worker.go:494] system disk:vda1
I0320 12:11:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:11:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:11:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:11:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:11:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:11:16.472453 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:11:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:23.409778 543705 memory.go:184] no items to output this cycle
I0320 12:11:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 12:11:24.240278 543705 disk_info.go:125] begin check local disk info of client
I0320 12:11:24.242809 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:11:24.242815 543705 disk_info.go:196] parse disk info done, disk is : [0xc000485280 0xc0004852c0]
E0320 12:11:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:33.409814 543705 memory.go:184] no items to output this cycle
I0320 12:11:33.409834 543705 cpu.go:275] no items to output this cycle
E0320 12:11:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:43.409811 543705 memory.go:191] Add success.
I0320 12:11:43.409811 543705 cpu.go:282] Add success.
I0320 12:11:43.419857 543705 net.go:770] primary dev: ETH0
I0320 12:11:43.419870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:11:43.419882 543705 net.go:698] Add success.
I0320 12:11:43.420243 543705 net.go:648] Add success.
I0320 12:11:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:11:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:11:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:11:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:11:53.409772 543705 memory.go:184] no items to output this cycle
I0320 12:11:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 12:12:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:03.409781 543705 memory.go:184] no items to output this cycle
I0320 12:12:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 12:12:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:13.409775 543705 memory.go:191] Add success.
W0320 12:12:13.409799 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:12:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:12:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:12:13.409821 543705 cpu.go:282] Add success.
I0320 12:12:13.420037 543705 net.go:770] primary dev: ETH0
I0320 12:12:13.420051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:12:13.420063 543705 net.go:698] Add success.
I0320 12:12:13.420404 543705 net.go:648] Add success.
I0320 12:12:13.627678 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a5ca21b7-b309-4819-b3a6-dfc604dd0c18","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:12:13.627714 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 12:12:14.454906 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:12:14.454970 543705 disk_worker.go:708] disk space is not compliant
W0320 12:12:14.454973 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:12:14.455704 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:12:14.455713 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:12:14.455719 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:12:14.456400 543705 disk_worker.go:494] system disk:vda1
I0320 12:12:14.456449 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:12:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:12:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:12:16.457940 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:12:16.457940 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:12:16.457995 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:12:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:12:16.472338 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:12:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:23.409802 543705 memory.go:184] no items to output this cycle
I0320 12:12:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 12:12:24.242900 543705 disk_info.go:125] begin check local disk info of client
I0320 12:12:24.245323 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:12:24.245329 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e040 0xc00034e080]
E0320 12:12:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:33.409790 543705 memory.go:184] no items to output this cycle
I0320 12:12:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 12:12:38.465096 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:12:38.465102 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:12:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:43.410548 543705 memory.go:191] Add success.
I0320 12:12:43.409822 543705 cpu.go:282] Add success.
I0320 12:12:43.420053 543705 net.go:770] primary dev: ETH0
I0320 12:12:43.420072 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:12:43.420088 543705 net.go:698] Add success.
I0320 12:12:43.420469 543705 net.go:648] Add success.
I0320 12:12:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:12:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:12:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:12:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:12:53.409769 543705 memory.go:184] no items to output this cycle
I0320 12:12:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 12:13:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:03.409802 543705 memory.go:184] no items to output this cycle
I0320 12:13:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 12:13:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:13.409809 543705 memory.go:191] Add success.
I0320 12:13:13.409817 543705 cpu.go:282] Add success.
W0320 12:13:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:13:13.412608 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:13:13.412614 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:13:13.420317 543705 net.go:648] Add success.
I0320 12:13:13.422088 543705 net.go:770] primary dev: ETH0
I0320 12:13:13.422101 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:13:13.422115 543705 net.go:698] Add success.
I0320 12:13:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:13:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:13:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 12:13:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:13:14.456765 543705 disk_worker.go:494] system disk:vda1
I0320 12:13:14.456798 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:13:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:13:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:13:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:13:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:13:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:13:23.409870 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:23.409885 543705 cpu.go:275] no items to output this cycle
I0320 12:13:23.409889 543705 memory.go:184] no items to output this cycle
I0320 12:13:24.245673 543705 disk_info.go:125] begin check local disk info of client
I0320 12:13:24.248071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:13:24.248077 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba000 0xc0002ba040]
E0320 12:13:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:33.409815 543705 memory.go:184] no items to output this cycle
I0320 12:13:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 12:13:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:43.409793 543705 memory.go:191] Add success.
I0320 12:13:43.409811 543705 cpu.go:282] Add success.
I0320 12:13:43.420008 543705 net.go:648] Add success.
I0320 12:13:43.422734 543705 net.go:770] primary dev: ETH0
I0320 12:13:43.422749 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:13:43.422764 543705 net.go:698] Add success.
I0320 12:13:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:13:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:13:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:13:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:13:53.409794 543705 memory.go:184] no items to output this cycle
I0320 12:13:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 12:14:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:03.409765 543705 memory.go:184] no items to output this cycle
I0320 12:14:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 12:14:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:13.409803 543705 memory.go:191] Add success.
W0320 12:14:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:14:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:14:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:14:13.409863 543705 cpu.go:282] Add success.
I0320 12:14:13.420382 543705 net.go:648] Add success.
I0320 12:14:13.423194 543705 net.go:770] primary dev: ETH0
I0320 12:14:13.423207 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:14:13.423220 543705 net.go:698] Add success.
I0320 12:14:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:14:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:14:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 12:14:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:14:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 12:14:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:14:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:14:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:14:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:14:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:14:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:14:23.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:23.409898 543705 memory.go:184] no items to output this cycle
I0320 12:14:23.409944 543705 cpu.go:275] no items to output this cycle
I0320 12:14:24.249671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:14:24.252077 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:14:24.252082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be080 0xc0002be0c0]
E0320 12:14:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:33.409779 543705 memory.go:184] no items to output this cycle
I0320 12:14:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 12:14:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:43.409790 543705 memory.go:191] Add success.
I0320 12:14:43.409799 543705 cpu.go:282] Add success.
I0320 12:14:43.419937 543705 net.go:648] Add success.
I0320 12:14:43.422453 543705 net.go:770] primary dev: ETH0
I0320 12:14:43.422465 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:14:43.422477 543705 net.go:698] Add success.
I0320 12:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:14:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:14:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:14:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:14:53.409784 543705 memory.go:184] no items to output this cycle
I0320 12:14:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 12:15:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:03.409797 543705 memory.go:184] no items to output this cycle
I0320 12:15:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:15:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:13.409787 543705 memory.go:191] Add success.
I0320 12:15:13.409807 543705 cpu.go:282] Add success.
W0320 12:15:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:15:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:15:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:15:13.420151 543705 net.go:648] Add success.
I0320 12:15:13.423272 543705 net.go:770] primary dev: ETH0
I0320 12:15:13.423287 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:15:13.423302 543705 net.go:698] Add success.
I0320 12:15:13.467669 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"85cc3622-cf3b-4553-9133-4204c2b94502","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:15:13.467701 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:15:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:15:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:15:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 12:15:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:15:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 12:15:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:15:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:15:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:15:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:15:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:15:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:15:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:23.409799 543705 memory.go:184] no items to output this cycle
I0320 12:15:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 12:15:24.253673 543705 disk_info.go:125] begin check local disk info of client
I0320 12:15:24.256171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:15:24.256176 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 12:15:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:33.409812 543705 memory.go:184] no items to output this cycle
I0320 12:15:33.409826 543705 cpu.go:275] no items to output this cycle
I0320 12:15:38.466086 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:15:38.466092 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:15:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:43.410871 543705 memory.go:191] Add success.
I0320 12:15:43.409825 543705 cpu.go:282] Add success.
I0320 12:15:43.420431 543705 net.go:770] primary dev: ETH0
I0320 12:15:43.420450 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:15:43.420465 543705 net.go:698] Add success.
I0320 12:15:43.420822 543705 net.go:648] Add success.
I0320 12:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:15:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:15:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:15:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:15:53.409765 543705 memory.go:184] no items to output this cycle
I0320 12:15:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:16:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:03.409769 543705 memory.go:184] no items to output this cycle
I0320 12:16:03.409798 543705 cpu.go:275] no items to output this cycle
W0320 12:16:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:16:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:16:13.409733 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:16:13.409798 543705 cpu.go:282] Add success.
E0320 12:16:13.409817 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:13.409840 543705 memory.go:191] Add success.
I0320 12:16:13.420162 543705 net.go:648] Add success.
I0320 12:16:13.422995 543705 net.go:770] primary dev: ETH0
I0320 12:16:13.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:16:13.423021 543705 net.go:698] Add success.
I0320 12:16:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:16:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:16:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 12:16:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:16:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 12:16:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:16:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:16:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:16:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:16:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:16:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:16:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:23.409788 543705 memory.go:184] no items to output this cycle
I0320 12:16:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 12:16:24.257669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:16:24.260172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:16:24.260178 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000e9240 0xc0000e9280]
E0320 12:16:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:33.409785 543705 memory.go:184] no items to output this cycle
I0320 12:16:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 12:16:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:43.409792 543705 memory.go:191] Add success.
I0320 12:16:43.409798 543705 cpu.go:282] Add success.
I0320 12:16:43.420091 543705 net.go:648] Add success.
I0320 12:16:43.422957 543705 net.go:770] primary dev: ETH0
I0320 12:16:43.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:16:43.422982 543705 net.go:698] Add success.
I0320 12:16:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:16:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:16:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:16:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:16:53.409811 543705 memory.go:184] no items to output this cycle
I0320 12:16:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 12:17:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:03.409781 543705 memory.go:184] no items to output this cycle
I0320 12:17:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 12:17:13.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:13.409834 543705 memory.go:191] Add success.
I0320 12:17:13.409838 543705 cpu.go:282] Add success.
W0320 12:17:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:17:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:17:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:17:13.420148 543705 net.go:648] Add success.
I0320 12:17:13.423136 543705 net.go:770] primary dev: ETH0
I0320 12:17:13.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:17:13.423186 543705 net.go:698] Add success.
I0320 12:17:13.452795 543705 event_worker.go:152] Polling the log file for events...
W0320 12:17:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:17:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 12:17:14.455171 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:17:14.456935 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:17:14.456944 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:17:14.456950 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:17:14.457001 543705 disk_worker.go:494] system disk:vda1
I0320 12:17:14.457046 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:17:15.456798 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:17:15.456806 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:17:16.457908 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:17:16.457905 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:17:16.457963 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:17:16.457981 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:17:16.472304 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:17:23.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:23.409900 543705 memory.go:184] no items to output this cycle
I0320 12:17:23.409942 543705 cpu.go:275] no items to output this cycle
I0320 12:17:24.261671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:17:24.264097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:17:24.264103 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4100 0xc0000c4140]
E0320 12:17:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:33.409777 543705 memory.go:184] no items to output this cycle
I0320 12:17:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 12:17:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:43.409820 543705 memory.go:191] Add success.
I0320 12:17:43.409825 543705 cpu.go:282] Add success.
I0320 12:17:43.419978 543705 net.go:648] Add success.
I0320 12:17:43.422963 543705 net.go:770] primary dev: ETH0
I0320 12:17:43.422978 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:17:43.422995 543705 net.go:698] Add success.
I0320 12:17:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:17:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:17:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:17:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:17:53.409777 543705 cpu.go:275] no items to output this cycle
I0320 12:17:53.409791 543705 memory.go:184] no items to output this cycle
E0320 12:18:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:03.409778 543705 memory.go:184] no items to output this cycle
I0320 12:18:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 12:18:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:13.409813 543705 memory.go:191] Add success.
I0320 12:18:13.409823 543705 cpu.go:282] Add success.
W0320 12:18:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:18:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:18:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:18:13.420142 543705 net.go:648] Add success.
I0320 12:18:13.423301 543705 net.go:770] primary dev: ETH0
I0320 12:18:13.423315 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:18:13.423326 543705 net.go:698] Add success.
I0320 12:18:13.955667 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8d228090-a625-4e01-a957-1acdf7897eae","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:18:13.955704 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:18:14.454168 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:18:14.454393 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:18:14.454403 543705 disk_worker.go:708] disk space is not compliant
W0320 12:18:14.454406 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:18:14.455933 543705 disk_worker.go:494] system disk:vda1
I0320 12:18:14.455966 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:18:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:18:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:18:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:18:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:18:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:18:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:23.409784 543705 memory.go:184] no items to output this cycle
I0320 12:18:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 12:18:24.265667 543705 disk_info.go:125] begin check local disk info of client
I0320 12:18:24.268110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:18:24.268116 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047fa00 0xc00047fa40]
E0320 12:18:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:33.409787 543705 memory.go:184] no items to output this cycle
I0320 12:18:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 12:18:38.467102 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:18:38.467110 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:18:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:43.410680 543705 memory.go:191] Add success.
I0320 12:18:43.409828 543705 cpu.go:282] Add success.
I0320 12:18:43.420385 543705 net.go:648] Add success.
I0320 12:18:43.423577 543705 net.go:770] primary dev: ETH0
I0320 12:18:43.423590 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:18:43.423603 543705 net.go:698] Add success.
I0320 12:18:46.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:18:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:18:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:18:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:18:53.409779 543705 memory.go:184] no items to output this cycle
I0320 12:18:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 12:19:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:03.409806 543705 memory.go:184] no items to output this cycle
I0320 12:19:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 12:19:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:13.409799 543705 memory.go:191] Add success.
I0320 12:19:13.409814 543705 cpu.go:282] Add success.
W0320 12:19:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:19:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:19:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:19:13.420133 543705 net.go:648] Add success.
I0320 12:19:13.423183 543705 net.go:770] primary dev: ETH0
I0320 12:19:13.423194 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:19:13.423206 543705 net.go:698] Add success.
I0320 12:19:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:19:14.455219 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:19:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0320 12:19:14.455233 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:19:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 12:19:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:19:15.456024 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:19:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:19:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:19:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:19:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:19:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:23.409788 543705 memory.go:184] no items to output this cycle
I0320 12:19:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 12:19:24.269671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:19:24.272135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:19:24.272141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb5c0 0xc0001fb600]
E0320 12:19:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:33.409790 543705 memory.go:184] no items to output this cycle
I0320 12:19:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 12:19:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:43.409828 543705 memory.go:191] Add success.
I0320 12:19:43.409836 543705 cpu.go:282] Add success.
I0320 12:19:43.419908 543705 net.go:648] Add success.
I0320 12:19:43.422872 543705 net.go:770] primary dev: ETH0
I0320 12:19:43.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:19:43.422900 543705 net.go:698] Add success.
I0320 12:19:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:19:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:19:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:19:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:19:53.409781 543705 memory.go:184] no items to output this cycle
I0320 12:19:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 12:20:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:03.409770 543705 memory.go:184] no items to output this cycle
I0320 12:20:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 12:20:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:13.409808 543705 memory.go:191] Add success.
I0320 12:20:13.409811 543705 cpu.go:282] Add success.
W0320 12:20:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:20:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:20:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:20:13.420083 543705 net.go:648] Add success.
I0320 12:20:13.422693 543705 net.go:770] primary dev: ETH0
I0320 12:20:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:20:13.422721 543705 net.go:698] Add success.
I0320 12:20:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:20:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:20:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 12:20:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:20:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 12:20:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:20:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:20:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:20:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:20:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:20:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:20:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:23.409804 543705 memory.go:184] no items to output this cycle
I0320 12:20:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 12:20:24.273672 543705 disk_info.go:125] begin check local disk info of client
I0320 12:20:24.276242 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:20:24.276248 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5c80 0xc0002a5cc0]
E0320 12:20:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 12:20:33.409809 543705 memory.go:184] no items to output this cycle
E0320 12:20:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:43.409786 543705 memory.go:191] Add success.
I0320 12:20:43.409818 543705 cpu.go:282] Add success.
I0320 12:20:43.420046 543705 net.go:648] Add success.
I0320 12:20:43.422689 543705 net.go:770] primary dev: ETH0
I0320 12:20:43.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:20:43.422715 543705 net.go:698] Add success.
I0320 12:20:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:20:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:20:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:20:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:20:53.409796 543705 memory.go:184] no items to output this cycle
I0320 12:20:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 12:21:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:03.409779 543705 memory.go:184] no items to output this cycle
I0320 12:21:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 12:21:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:13.409797 543705 memory.go:191] Add success.
I0320 12:21:13.409801 543705 cpu.go:282] Add success.
W0320 12:21:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:21:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:21:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:21:13.420134 543705 net.go:770] primary dev: ETH0
I0320 12:21:13.420147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:21:13.420160 543705 net.go:698] Add success.
I0320 12:21:13.420394 543705 net.go:648] Add success.
I0320 12:21:13.468951 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"60530d1b-b7a7-46b2-aac9-de21056f550a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:21:13.468985 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:21:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:21:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:21:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 12:21:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:21:14.456862 543705 disk_worker.go:494] system disk:vda1
I0320 12:21:14.456893 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:21:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:21:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:21:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:21:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:21:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:23.409777 543705 memory.go:184] no items to output this cycle
I0320 12:21:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 12:21:24.277669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:21:24.280104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:21:24.280109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa080 0xc0001fa0c0]
E0320 12:21:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:33.409784 543705 memory.go:184] no items to output this cycle
I0320 12:21:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 12:21:38.468091 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:21:38.468098 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:21:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:43.410656 543705 memory.go:191] Add success.
I0320 12:21:43.409811 543705 cpu.go:282] Add success.
I0320 12:21:43.420195 543705 net.go:770] primary dev: ETH0
I0320 12:21:43.420208 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:21:43.420221 543705 net.go:698] Add success.
I0320 12:21:43.420585 543705 net.go:648] Add success.
I0320 12:21:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:21:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:21:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:21:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:21:53.409780 543705 memory.go:184] no items to output this cycle
I0320 12:21:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 12:22:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:03.409768 543705 memory.go:184] no items to output this cycle
I0320 12:22:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 12:22:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:13.409794 543705 memory.go:191] Add success.
I0320 12:22:13.409799 543705 cpu.go:282] Add success.
W0320 12:22:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:22:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:22:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:22:13.420064 543705 net.go:648] Add success.
I0320 12:22:13.422739 543705 net.go:770] primary dev: ETH0
I0320 12:22:13.422752 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:22:13.422765 543705 net.go:698] Add success.
W0320 12:22:14.455382 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:22:14.455477 543705 disk_worker.go:708] disk space is not compliant
W0320 12:22:14.455482 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:22:14.457508 543705 disk_worker.go:494] system disk:vda1
E0320 12:22:14.457588 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:22:14.457597 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:22:14.457603 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:22:14.457605 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:22:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:22:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:22:16.457959 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:22:16.457968 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:22:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:22:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:22:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:22:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:23.409765 543705 memory.go:184] no items to output this cycle
I0320 12:22:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 12:22:24.281673 543705 disk_info.go:125] begin check local disk info of client
I0320 12:22:24.284112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:22:24.284119 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c1840 0xc0003c1880]
E0320 12:22:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:33.409774 543705 memory.go:184] no items to output this cycle
I0320 12:22:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 12:22:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:43.409784 543705 memory.go:191] Add success.
I0320 12:22:43.409809 543705 cpu.go:282] Add success.
I0320 12:22:43.420285 543705 net.go:648] Add success.
I0320 12:22:43.422930 543705 net.go:770] primary dev: ETH0
I0320 12:22:43.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:22:43.422967 543705 net.go:698] Add success.
I0320 12:22:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:22:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:22:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:22:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:22:53.409770 543705 memory.go:184] no items to output this cycle
I0320 12:22:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 12:23:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:03.409797 543705 memory.go:184] no items to output this cycle
I0320 12:23:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 12:23:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:13.409787 543705 memory.go:191] Add success.
I0320 12:23:13.409810 543705 cpu.go:282] Add success.
W0320 12:23:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:23:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:23:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:23:13.420144 543705 net.go:648] Add success.
I0320 12:23:13.422868 543705 net.go:770] primary dev: ETH0
I0320 12:23:13.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:23:13.422892 543705 net.go:698] Add success.
I0320 12:23:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:23:14.455458 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:23:14.455471 543705 disk_worker.go:708] disk space is not compliant
W0320 12:23:14.455475 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:23:14.457074 543705 disk_worker.go:494] system disk:vda1
I0320 12:23:14.457103 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:23:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:23:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:23:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:23:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:23:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:23:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:23.409800 543705 memory.go:184] no items to output this cycle
I0320 12:23:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 12:23:24.285669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:23:24.288167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:23:24.288174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047a480 0xc00047a4c0]
E0320 12:23:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:33.409780 543705 memory.go:184] no items to output this cycle
I0320 12:23:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:23:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:43.409822 543705 memory.go:191] Add success.
I0320 12:23:43.409831 543705 cpu.go:282] Add success.
I0320 12:23:43.419947 543705 net.go:648] Add success.
I0320 12:23:43.422933 543705 net.go:770] primary dev: ETH0
I0320 12:23:43.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:23:43.422959 543705 net.go:698] Add success.
I0320 12:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:23:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:23:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:23:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:23:53.409774 543705 memory.go:184] no items to output this cycle
I0320 12:23:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 12:24:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:03.409772 543705 memory.go:184] no items to output this cycle
I0320 12:24:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 12:24:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:13.409817 543705 memory.go:191] Add success.
I0320 12:24:13.409825 543705 cpu.go:282] Add success.
W0320 12:24:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:24:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:24:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:24:13.420046 543705 net.go:648] Add success.
I0320 12:24:13.422810 543705 net.go:770] primary dev: ETH0
I0320 12:24:13.422824 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:24:13.422837 543705 net.go:698] Add success.
I0320 12:24:13.469118 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"093d2a7e-d2de-4077-a30e-3bb39c34e3ff","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:24:13.469151 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:24:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:24:14.455320 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:24:14.455333 543705 disk_worker.go:708] disk space is not compliant
W0320 12:24:14.455337 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:24:14.457448 543705 disk_worker.go:494] system disk:vda1
I0320 12:24:14.457489 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:24:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:24:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:24:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:24:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:24:16.472515 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:24:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:23.409764 543705 memory.go:184] no items to output this cycle
I0320 12:24:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 12:24:24.289671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:24:24.292088 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:24:24.292093 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047ba00 0xc00047ba40]
E0320 12:24:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:33.409803 543705 memory.go:184] no items to output this cycle
I0320 12:24:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 12:24:38.469107 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:24:38.469115 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:24:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:43.410585 543705 memory.go:191] Add success.
I0320 12:24:43.409821 543705 cpu.go:282] Add success.
I0320 12:24:43.420341 543705 net.go:648] Add success.
I0320 12:24:43.423052 543705 net.go:770] primary dev: ETH0
I0320 12:24:43.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:24:43.423078 543705 net.go:698] Add success.
I0320 12:24:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:24:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:24:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:24:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:24:53.409806 543705 memory.go:184] no items to output this cycle
I0320 12:24:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 12:25:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:03.409803 543705 memory.go:184] no items to output this cycle
I0320 12:25:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 12:25:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:13.409778 543705 memory.go:191] Add success.
W0320 12:25:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:25:13.409804 543705 cpu.go:282] Add success.
W0320 12:25:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:25:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:25:13.420065 543705 net.go:648] Add success.
I0320 12:25:13.423093 543705 net.go:770] primary dev: ETH0
I0320 12:25:13.423105 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:25:13.423117 543705 net.go:698] Add success.
I0320 12:25:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:25:14.455255 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:25:14.455318 543705 disk_worker.go:708] disk space is not compliant
W0320 12:25:14.455320 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:25:14.457553 543705 disk_worker.go:494] system disk:vda1
I0320 12:25:14.457594 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:25:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:25:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:25:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:25:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:25:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:25:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:23.409800 543705 memory.go:184] no items to output this cycle
I0320 12:25:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 12:25:24.293669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:25:24.296110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:25:24.296115 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047af00 0xc00047af40]
E0320 12:25:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:33.409778 543705 memory.go:184] no items to output this cycle
I0320 12:25:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 12:25:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:43.409802 543705 memory.go:191] Add success.
I0320 12:25:43.409804 543705 cpu.go:282] Add success.
I0320 12:25:43.419965 543705 net.go:648] Add success.
I0320 12:25:43.422979 543705 net.go:770] primary dev: ETH0
I0320 12:25:43.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:25:43.423009 543705 net.go:698] Add success.
I0320 12:25:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:25:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:25:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:25:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:25:53.409765 543705 memory.go:184] no items to output this cycle
I0320 12:25:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 12:26:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:03.409778 543705 memory.go:184] no items to output this cycle
I0320 12:26:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 12:26:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:13.409778 543705 memory.go:191] Add success.
W0320 12:26:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:26:13.409811 543705 cpu.go:282] Add success.
W0320 12:26:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:26:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:26:13.420061 543705 net.go:648] Add success.
I0320 12:26:13.423378 543705 net.go:770] primary dev: ETH0
I0320 12:26:13.423391 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:26:13.423403 543705 net.go:698] Add success.
I0320 12:26:14.454987 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:26:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:26:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 12:26:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:26:14.456655 543705 disk_worker.go:494] system disk:vda1
I0320 12:26:14.456700 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:26:15.456017 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:26:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:26:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:26:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:26:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:26:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:23.409773 543705 memory.go:184] no items to output this cycle
I0320 12:26:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 12:26:24.297674 543705 disk_info.go:125] begin check local disk info of client
I0320 12:26:24.300098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:26:24.300104 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029b1c0 0xc00029b200]
E0320 12:26:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:33.409797 543705 memory.go:184] no items to output this cycle
I0320 12:26:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 12:26:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:43.409794 543705 memory.go:191] Add success.
I0320 12:26:43.409800 543705 cpu.go:282] Add success.
I0320 12:26:43.419923 543705 net.go:648] Add success.
I0320 12:26:43.422641 543705 net.go:770] primary dev: ETH0
I0320 12:26:43.422654 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:26:43.422667 543705 net.go:698] Add success.
I0320 12:26:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:26:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:26:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:26:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:26:53.409767 543705 memory.go:184] no items to output this cycle
I0320 12:26:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:27:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:03.409771 543705 memory.go:184] no items to output this cycle
I0320 12:27:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 12:27:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:13.409808 543705 memory.go:191] Add success.
I0320 12:27:13.409820 543705 cpu.go:282] Add success.
W0320 12:27:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:27:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:27:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:27:13.420126 543705 net.go:648] Add success.
I0320 12:27:13.422882 543705 net.go:770] primary dev: ETH0
I0320 12:27:13.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:27:13.422907 543705 net.go:698] Add success.
I0320 12:27:13.428841 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 12:27:13.453078 543705 event_worker.go:152] Polling the log file for events...
I0320 12:27:13.468874 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"af08c090-ea97-4430-9b8e-3773364b8bc7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:27:13.468908 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 12:27:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:27:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 12:27:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:27:14.456756 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:27:14.456765 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:27:14.456771 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:27:14.456970 543705 disk_worker.go:494] system disk:vda1
I0320 12:27:14.457025 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:27:15.456812 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:27:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:27:16.457905 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:27:16.457905 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:27:16.457961 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:27:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:27:16.472308 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:27:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:23.409761 543705 memory.go:184] no items to output this cycle
I0320 12:27:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 12:27:24.301671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:27:24.304078 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:27:24.304084 543705 disk_info.go:196] parse disk info done, disk is : [0xc000518b40 0xc000518b80]
E0320 12:27:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:33.409805 543705 memory.go:184] no items to output this cycle
I0320 12:27:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 12:27:38.470108 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:27:38.470114 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:27:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:43.410628 543705 memory.go:191] Add success.
I0320 12:27:43.409815 543705 cpu.go:282] Add success.
I0320 12:27:43.420428 543705 net.go:648] Add success.
I0320 12:27:43.423002 543705 net.go:770] primary dev: ETH0
I0320 12:27:43.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:27:43.423037 543705 net.go:698] Add success.
I0320 12:27:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:27:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:27:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:27:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:27:53.409773 543705 cpu.go:275] no items to output this cycle
I0320 12:27:53.409780 543705 memory.go:184] no items to output this cycle
E0320 12:28:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:03.409783 543705 memory.go:184] no items to output this cycle
I0320 12:28:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 12:28:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:13.409799 543705 memory.go:191] Add success.
I0320 12:28:13.409799 543705 cpu.go:282] Add success.
W0320 12:28:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:28:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:28:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:28:13.420052 543705 net.go:770] primary dev: ETH0
I0320 12:28:13.420067 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:28:13.420083 543705 net.go:698] Add success.
I0320 12:28:13.420450 543705 net.go:648] Add success.
I0320 12:28:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:28:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:28:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 12:28:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:28:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 12:28:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:28:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:28:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:28:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:28:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:28:16.472440 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:28:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:23.409773 543705 memory.go:184] no items to output this cycle
I0320 12:28:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 12:28:24.305672 543705 disk_info.go:125] begin check local disk info of client
I0320 12:28:24.308118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:28:24.308124 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048d3c0 0xc00048d400]
E0320 12:28:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:33.409801 543705 memory.go:184] no items to output this cycle
I0320 12:28:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:28:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:43.409801 543705 memory.go:191] Add success.
I0320 12:28:43.409802 543705 cpu.go:282] Add success.
I0320 12:28:43.419883 543705 net.go:648] Add success.
I0320 12:28:43.422878 543705 net.go:770] primary dev: ETH0
I0320 12:28:43.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:28:43.422907 543705 net.go:698] Add success.
I0320 12:28:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:28:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:28:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:28:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:28:53.409811 543705 memory.go:184] no items to output this cycle
I0320 12:28:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 12:29:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:03.409807 543705 memory.go:184] no items to output this cycle
I0320 12:29:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 12:29:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:13.409795 543705 memory.go:191] Add success.
I0320 12:29:13.409795 543705 cpu.go:282] Add success.
W0320 12:29:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:29:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:29:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:29:13.420095 543705 net.go:648] Add success.
I0320 12:29:13.423324 543705 net.go:770] primary dev: ETH0
I0320 12:29:13.423337 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:29:13.423349 543705 net.go:698] Add success.
I0320 12:29:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:29:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:29:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 12:29:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:29:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 12:29:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:29:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:29:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:29:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:29:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:29:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:29:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:23.409780 543705 memory.go:184] no items to output this cycle
I0320 12:29:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 12:29:24.309671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:29:24.312053 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:29:24.312058 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475b40 0xc000475b80]
E0320 12:29:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:33.409796 543705 memory.go:184] no items to output this cycle
I0320 12:29:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 12:29:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:43.409808 543705 cpu.go:282] Add success.
I0320 12:29:43.409817 543705 memory.go:191] Add success.
I0320 12:29:43.419991 543705 net.go:648] Add success.
I0320 12:29:43.422813 543705 net.go:770] primary dev: ETH0
I0320 12:29:43.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:29:43.422840 543705 net.go:698] Add success.
I0320 12:29:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:29:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:29:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:29:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:29:53.409765 543705 memory.go:184] no items to output this cycle
I0320 12:29:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 12:30:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:03.409776 543705 memory.go:184] no items to output this cycle
I0320 12:30:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 12:30:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:13.409786 543705 memory.go:191] Add success.
I0320 12:30:13.409802 543705 cpu.go:282] Add success.
W0320 12:30:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:30:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:30:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:30:13.420093 543705 net.go:648] Add success.
I0320 12:30:13.422746 543705 net.go:770] primary dev: ETH0
I0320 12:30:13.422761 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:30:13.422775 543705 net.go:698] Add success.
I0320 12:30:13.763318 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"30c4eb2c-0fb6-4547-86fe-1e5f306fb58e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:30:13.763351 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:30:14.453982 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:30:14.454184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:30:14.454195 543705 disk_worker.go:708] disk space is not compliant
W0320 12:30:14.454197 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:30:14.455547 543705 disk_worker.go:494] system disk:vda1
I0320 12:30:14.455593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:30:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:30:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:30:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:30:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:30:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:30:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:23.409797 543705 memory.go:184] no items to output this cycle
I0320 12:30:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 12:30:24.312149 543705 disk_info.go:125] begin check local disk info of client
I0320 12:30:24.314611 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:30:24.314617 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046a200 0xc00046a240]
E0320 12:30:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:33.409784 543705 memory.go:184] no items to output this cycle
I0320 12:30:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 12:30:38.471107 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:30:38.471114 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:30:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:43.410719 543705 memory.go:191] Add success.
I0320 12:30:43.409818 543705 cpu.go:282] Add success.
I0320 12:30:43.420444 543705 net.go:648] Add success.
I0320 12:30:43.423068 543705 net.go:770] primary dev: ETH0
I0320 12:30:43.423085 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:30:43.423099 543705 net.go:698] Add success.
I0320 12:30:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:30:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:30:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:30:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:30:53.409800 543705 memory.go:184] no items to output this cycle
I0320 12:30:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 12:31:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:03.409783 543705 memory.go:184] no items to output this cycle
I0320 12:31:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 12:31:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:13.409814 543705 memory.go:191] Add success.
I0320 12:31:13.409820 543705 cpu.go:282] Add success.
W0320 12:31:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:31:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:31:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:31:13.420128 543705 net.go:648] Add success.
I0320 12:31:13.422894 543705 net.go:770] primary dev: ETH0
I0320 12:31:13.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:31:13.422935 543705 net.go:698] Add success.
I0320 12:31:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:31:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:31:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 12:31:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:31:14.456473 543705 disk_worker.go:494] system disk:vda1
I0320 12:31:14.456518 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:31:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:31:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:31:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:31:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:31:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:31:23.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:23.409893 543705 memory.go:184] no items to output this cycle
I0320 12:31:23.409948 543705 cpu.go:275] no items to output this cycle
I0320 12:31:24.317667 543705 disk_info.go:125] begin check local disk info of client
I0320 12:31:24.320138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:31:24.320144 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474000 0xc000474040]
E0320 12:31:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:33.409809 543705 memory.go:184] no items to output this cycle
I0320 12:31:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:31:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:43.409794 543705 memory.go:191] Add success.
I0320 12:31:43.409799 543705 cpu.go:282] Add success.
I0320 12:31:43.420085 543705 net.go:648] Add success.
I0320 12:31:43.423137 543705 net.go:770] primary dev: ETH0
I0320 12:31:43.423150 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:31:43.423168 543705 net.go:698] Add success.
I0320 12:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:31:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:31:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:31:53.410380 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:31:53.410395 543705 memory.go:184] no items to output this cycle
I0320 12:31:53.410398 543705 cpu.go:275] no items to output this cycle
E0320 12:32:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:03.409801 543705 memory.go:184] no items to output this cycle
I0320 12:32:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 12:32:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:13.409776 543705 memory.go:191] Add success.
I0320 12:32:13.409799 543705 cpu.go:282] Add success.
W0320 12:32:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:32:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:32:13.409814 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:32:13.420056 543705 net.go:648] Add success.
I0320 12:32:13.423148 543705 net.go:770] primary dev: ETH0
I0320 12:32:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:32:13.423171 543705 net.go:698] Add success.
W0320 12:32:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:32:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 12:32:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:32:14.456912 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:32:14.456921 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:32:14.456927 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:32:14.457018 543705 disk_worker.go:494] system disk:vda1
I0320 12:32:14.457062 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:32:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:32:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:32:16.457937 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:32:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:32:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:32:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:32:16.472342 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:32:23.409880 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:23.409901 543705 memory.go:184] no items to output this cycle
I0320 12:32:23.409905 543705 cpu.go:275] no items to output this cycle
I0320 12:32:24.321670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:32:24.324186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:32:24.324192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b27c0 0xc0003b29c0]
E0320 12:32:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:33.409802 543705 memory.go:184] no items to output this cycle
I0320 12:32:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 12:32:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:43.409812 543705 memory.go:191] Add success.
I0320 12:32:43.409814 543705 cpu.go:282] Add success.
I0320 12:32:43.419965 543705 net.go:648] Add success.
I0320 12:32:43.422756 543705 net.go:770] primary dev: ETH0
I0320 12:32:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:32:43.422781 543705 net.go:698] Add success.
I0320 12:32:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:32:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:32:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:32:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:32:53.409805 543705 memory.go:184] no items to output this cycle
I0320 12:32:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:33:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:03.409780 543705 memory.go:184] no items to output this cycle
I0320 12:33:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 12:33:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:13.409783 543705 cpu.go:282] Add success.
I0320 12:33:13.409793 543705 memory.go:191] Add success.
W0320 12:33:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:33:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:33:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:33:13.420234 543705 net.go:648] Add success.
I0320 12:33:13.422854 543705 net.go:770] primary dev: ETH0
I0320 12:33:13.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:33:13.422883 543705 net.go:698] Add success.
I0320 12:33:13.469589 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0707ead8-2365-45cf-b645-6850b2c07903","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:33:13.469630 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:33:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:33:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:33:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 12:33:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:33:14.456689 543705 disk_worker.go:494] system disk:vda1
I0320 12:33:14.456720 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:33:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:33:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:33:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:33:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:33:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:33:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:23.409777 543705 memory.go:184] no items to output this cycle
I0320 12:33:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 12:33:24.325668 543705 disk_info.go:125] begin check local disk info of client
I0320 12:33:24.328093 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:33:24.328099 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b180 0xc00007b1c0]
E0320 12:33:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:33.409805 543705 memory.go:184] no items to output this cycle
I0320 12:33:33.409820 543705 cpu.go:275] no items to output this cycle
I0320 12:33:38.472118 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:33:38.472136 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:33:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:43.410657 543705 memory.go:191] Add success.
I0320 12:33:43.409795 543705 cpu.go:282] Add success.
I0320 12:33:43.420365 543705 net.go:648] Add success.
I0320 12:33:43.422870 543705 net.go:770] primary dev: ETH0
I0320 12:33:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:33:43.422896 543705 net.go:698] Add success.
I0320 12:33:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:33:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:33:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:33:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:33:53.409796 543705 memory.go:184] no items to output this cycle
I0320 12:33:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 12:34:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:03.409774 543705 memory.go:184] no items to output this cycle
I0320 12:34:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 12:34:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:13.409807 543705 memory.go:191] Add success.
I0320 12:34:13.409817 543705 cpu.go:282] Add success.
W0320 12:34:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:34:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:34:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:34:13.420050 543705 net.go:648] Add success.
I0320 12:34:13.422734 543705 net.go:770] primary dev: ETH0
I0320 12:34:13.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:34:13.422764 543705 net.go:698] Add success.
I0320 12:34:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:34:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:34:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 12:34:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:34:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 12:34:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:34:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:34:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:34:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:34:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:34:16.472453 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:34:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:23.409796 543705 memory.go:184] no items to output this cycle
I0320 12:34:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 12:34:24.329678 543705 disk_info.go:125] begin check local disk info of client
I0320 12:34:24.332102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:34:24.332107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5240 0xc0000c5280]
E0320 12:34:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:33.409767 543705 memory.go:184] no items to output this cycle
I0320 12:34:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:34:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:43.409814 543705 memory.go:191] Add success.
I0320 12:34:43.409820 543705 cpu.go:282] Add success.
I0320 12:34:43.419991 543705 net.go:648] Add success.
I0320 12:34:43.422774 543705 net.go:770] primary dev: ETH0
I0320 12:34:43.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:34:43.422799 543705 net.go:698] Add success.
I0320 12:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:34:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:34:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:34:53.409781 543705 memory.go:184] no items to output this cycle
I0320 12:34:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 12:35:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:03.409778 543705 memory.go:184] no items to output this cycle
I0320 12:35:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 12:35:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:13.409810 543705 memory.go:191] Add success.
I0320 12:35:13.409821 543705 cpu.go:282] Add success.
W0320 12:35:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:35:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:35:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:35:13.420153 543705 net.go:648] Add success.
I0320 12:35:13.422790 543705 net.go:770] primary dev: ETH0
I0320 12:35:13.422803 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:35:13.422815 543705 net.go:698] Add success.
I0320 12:35:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:35:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:35:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 12:35:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:35:14.456525 543705 disk_worker.go:494] system disk:vda1
I0320 12:35:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:35:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:35:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:35:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:35:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:35:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:35:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:23.409777 543705 memory.go:184] no items to output this cycle
I0320 12:35:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 12:35:24.333669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:35:24.336104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:35:24.336109 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386940 0xc000386980]
E0320 12:35:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:33.409777 543705 memory.go:184] no items to output this cycle
I0320 12:35:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 12:35:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:43.409820 543705 memory.go:191] Add success.
I0320 12:35:43.409826 543705 cpu.go:282] Add success.
I0320 12:35:43.419906 543705 net.go:648] Add success.
I0320 12:35:43.422643 543705 net.go:770] primary dev: ETH0
I0320 12:35:43.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:35:43.422670 543705 net.go:698] Add success.
I0320 12:35:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:35:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:35:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:35:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:35:53.409768 543705 memory.go:184] no items to output this cycle
I0320 12:35:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 12:36:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:03.409779 543705 cpu.go:275] no items to output this cycle
I0320 12:36:03.409781 543705 memory.go:184] no items to output this cycle
E0320 12:36:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:13.409806 543705 memory.go:191] Add success.
I0320 12:36:13.409827 543705 cpu.go:282] Add success.
W0320 12:36:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:36:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:36:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:36:13.420186 543705 net.go:648] Add success.
I0320 12:36:13.422662 543705 net.go:770] primary dev: ETH0
I0320 12:36:13.422677 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:36:13.422691 543705 net.go:698] Add success.
I0320 12:36:13.464618 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fe32ae03-afad-40c4-9828-eb56e1ffad07","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:36:13.464655 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:36:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:36:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 12:36:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:36:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 12:36:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:36:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:36:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:36:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:36:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:36:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:36:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:23.409770 543705 memory.go:184] no items to output this cycle
I0320 12:36:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 12:36:24.338287 543705 disk_info.go:125] begin check local disk info of client
I0320 12:36:24.340790 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:36:24.340797 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474c40 0xc000474c80]
E0320 12:36:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:33.409783 543705 memory.go:184] no items to output this cycle
I0320 12:36:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 12:36:38.473112 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:36:38.473119 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:36:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:43.410672 543705 memory.go:191] Add success.
I0320 12:36:43.409805 543705 cpu.go:282] Add success.
I0320 12:36:43.420347 543705 net.go:648] Add success.
I0320 12:36:43.423448 543705 net.go:770] primary dev: ETH0
I0320 12:36:43.423463 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:36:43.423477 543705 net.go:698] Add success.
I0320 12:36:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:36:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:36:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:36:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:36:53.409777 543705 memory.go:184] no items to output this cycle
I0320 12:36:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:37:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:03.409780 543705 memory.go:184] no items to output this cycle
I0320 12:37:03.409789 543705 cpu.go:275] no items to output this cycle
W0320 12:37:13.409703 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:37:13.409718 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:37:13.409722 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 12:37:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:13.409806 543705 cpu.go:282] Add success.
I0320 12:37:13.409812 543705 memory.go:191] Add success.
I0320 12:37:13.420049 543705 net.go:648] Add success.
I0320 12:37:13.422707 543705 net.go:770] primary dev: ETH0
I0320 12:37:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:37:13.422732 543705 net.go:698] Add success.
I0320 12:37:13.453303 543705 event_worker.go:152] Polling the log file for events...
W0320 12:37:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:37:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 12:37:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:37:14.455857 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:37:14.455866 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:37:14.455873 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:37:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 12:37:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:37:15.456795 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:37:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:37:16.457936 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:37:16.457936 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:37:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:37:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:37:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:37:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:23.409799 543705 memory.go:184] no items to output this cycle
I0320 12:37:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 12:37:24.341669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:37:24.344109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:37:24.344116 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa340 0xc0001aa380]
E0320 12:37:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:33.409803 543705 memory.go:184] no items to output this cycle
I0320 12:37:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:37:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:43.409800 543705 memory.go:191] Add success.
I0320 12:37:43.409804 543705 cpu.go:282] Add success.
I0320 12:37:43.419976 543705 net.go:648] Add success.
I0320 12:37:43.422377 543705 net.go:770] primary dev: ETH0
I0320 12:37:43.422393 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:37:43.422407 543705 net.go:698] Add success.
I0320 12:37:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:37:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:37:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:37:53.410366 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:37:53.410382 543705 memory.go:184] no items to output this cycle
I0320 12:37:53.410385 543705 cpu.go:275] no items to output this cycle
E0320 12:38:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:03.409777 543705 memory.go:184] no items to output this cycle
I0320 12:38:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 12:38:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:13.409776 543705 memory.go:191] Add success.
W0320 12:38:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:38:13.409809 543705 cpu.go:282] Add success.
W0320 12:38:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:38:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:38:13.420054 543705 net.go:648] Add success.
I0320 12:38:13.422862 543705 net.go:770] primary dev: ETH0
I0320 12:38:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:38:13.422888 543705 net.go:698] Add success.
I0320 12:38:14.455217 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:38:14.455236 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:38:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0320 12:38:14.455249 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:38:14.456648 543705 disk_worker.go:494] system disk:vda1
I0320 12:38:14.456681 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:38:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:38:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:38:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:38:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:38:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:38:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:23.409780 543705 memory.go:184] no items to output this cycle
I0320 12:38:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 12:38:24.345669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:38:24.348089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:38:24.348094 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376e00 0xc000376e40]
E0320 12:38:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:33.409794 543705 memory.go:184] no items to output this cycle
I0320 12:38:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 12:38:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:43.409791 543705 memory.go:191] Add success.
I0320 12:38:43.409796 543705 cpu.go:282] Add success.
I0320 12:38:43.419919 543705 net.go:648] Add success.
I0320 12:38:43.422659 543705 net.go:770] primary dev: ETH0
I0320 12:38:43.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:38:43.422688 543705 net.go:698] Add success.
I0320 12:38:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:38:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:38:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:38:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:38:53.409775 543705 memory.go:184] no items to output this cycle
I0320 12:38:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 12:39:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:03.409803 543705 memory.go:184] no items to output this cycle
I0320 12:39:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 12:39:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:13.409788 543705 memory.go:191] Add success.
I0320 12:39:13.409790 543705 cpu.go:282] Add success.
W0320 12:39:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:39:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:39:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:39:13.420048 543705 net.go:648] Add success.
I0320 12:39:13.422789 543705 net.go:770] primary dev: ETH0
I0320 12:39:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:39:13.422814 543705 net.go:698] Add success.
I0320 12:39:13.468850 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d8c55910-3126-4cac-a355-131716e4c574","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:39:13.468884 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:39:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:39:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:39:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 12:39:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:39:14.457036 543705 disk_worker.go:494] system disk:vda1
I0320 12:39:14.457064 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:39:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:39:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:39:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:39:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:39:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:23.409776 543705 memory.go:184] no items to output this cycle
I0320 12:39:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 12:39:24.349670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:39:24.352082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:39:24.352088 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fd240 0xc0001fd280]
E0320 12:39:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:33.409814 543705 memory.go:184] no items to output this cycle
I0320 12:39:33.409836 543705 cpu.go:275] no items to output this cycle
I0320 12:39:38.474123 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:39:38.474129 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:39:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:43.410675 543705 memory.go:191] Add success.
I0320 12:39:43.409821 543705 cpu.go:282] Add success.
I0320 12:39:43.420382 543705 net.go:648] Add success.
I0320 12:39:43.423380 543705 net.go:770] primary dev: ETH0
I0320 12:39:43.423394 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:39:43.423411 543705 net.go:698] Add success.
I0320 12:39:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:39:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:39:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:39:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:39:53.409787 543705 memory.go:184] no items to output this cycle
I0320 12:39:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 12:40:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:03.409767 543705 memory.go:184] no items to output this cycle
I0320 12:40:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 12:40:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:13.409827 543705 memory.go:191] Add success.
I0320 12:40:13.409842 543705 cpu.go:282] Add success.
W0320 12:40:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:40:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:40:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:40:13.420136 543705 net.go:648] Add success.
I0320 12:40:13.423010 543705 net.go:770] primary dev: ETH0
I0320 12:40:13.423025 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:40:13.423039 543705 net.go:698] Add success.
I0320 12:40:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:40:14.455093 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:40:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 12:40:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:40:14.456482 543705 disk_worker.go:494] system disk:vda1
I0320 12:40:14.456508 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:40:15.456025 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:40:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:40:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:40:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:40:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:40:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:23.409787 543705 memory.go:184] no items to output this cycle
I0320 12:40:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 12:40:24.353670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:40:24.356140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:40:24.356145 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002908c0 0xc000290900]
E0320 12:40:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:33.409794 543705 memory.go:184] no items to output this cycle
I0320 12:40:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 12:40:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:43.409800 543705 memory.go:191] Add success.
I0320 12:40:43.409818 543705 cpu.go:282] Add success.
I0320 12:40:43.419868 543705 net.go:648] Add success.
I0320 12:40:43.422630 543705 net.go:770] primary dev: ETH0
I0320 12:40:43.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:40:43.422656 543705 net.go:698] Add success.
I0320 12:40:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:40:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:40:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:40:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:40:53.409812 543705 memory.go:184] no items to output this cycle
I0320 12:40:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 12:41:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:03.409808 543705 memory.go:184] no items to output this cycle
I0320 12:41:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 12:41:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:13.409823 543705 memory.go:191] Add success.
I0320 12:41:13.409832 543705 cpu.go:282] Add success.
W0320 12:41:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:41:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:41:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:41:13.420062 543705 net.go:648] Add success.
I0320 12:41:13.423202 543705 net.go:770] primary dev: ETH0
I0320 12:41:13.423215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:41:13.423226 543705 net.go:698] Add success.
I0320 12:41:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:41:14.455333 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:41:14.455446 543705 disk_worker.go:708] disk space is not compliant
W0320 12:41:14.455451 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:41:14.457568 543705 disk_worker.go:494] system disk:vda1
I0320 12:41:14.457602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:41:15.455948 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:41:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:41:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:41:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:41:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:41:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:23.409769 543705 memory.go:184] no items to output this cycle
I0320 12:41:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 12:41:24.357671 543705 disk_info.go:125] begin check local disk info of client
I0320 12:41:24.360139 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:41:24.360145 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb040 0xc0001fb080]
E0320 12:41:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:33.409810 543705 memory.go:184] no items to output this cycle
I0320 12:41:33.409837 543705 cpu.go:275] no items to output this cycle
E0320 12:41:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:43.409825 543705 memory.go:191] Add success.
I0320 12:41:43.409853 543705 cpu.go:282] Add success.
I0320 12:41:43.420010 543705 net.go:648] Add success.
I0320 12:41:43.423043 543705 net.go:770] primary dev: ETH0
I0320 12:41:43.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:41:43.423068 543705 net.go:698] Add success.
I0320 12:41:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:41:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:41:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:41:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:41:53.409782 543705 memory.go:184] no items to output this cycle
I0320 12:41:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 12:42:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:03.409772 543705 memory.go:184] no items to output this cycle
I0320 12:42:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:42:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:13.409809 543705 memory.go:191] Add success.
I0320 12:42:13.409818 543705 cpu.go:282] Add success.
W0320 12:42:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:42:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:42:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:42:13.420229 543705 net.go:648] Add success.
I0320 12:42:13.423095 543705 net.go:770] primary dev: ETH0
I0320 12:42:13.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:42:13.423121 543705 net.go:698] Add success.
I0320 12:42:13.470887 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fb3c1a60-170a-4b7d-bd50-c4972dc7b522","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:42:13.470923 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 12:42:14.455272 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:42:14.455286 543705 disk_worker.go:708] disk space is not compliant
W0320 12:42:14.455289 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:42:14.456709 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:42:14.456719 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:42:14.456725 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:42:14.456764 543705 disk_worker.go:494] system disk:vda1
I0320 12:42:14.456820 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:42:15.456825 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:42:15.456833 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:42:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:42:16.457964 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:42:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:42:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:42:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:42:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:23.409777 543705 memory.go:184] no items to output this cycle
I0320 12:42:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 12:42:24.361670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:42:24.364109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:42:24.364115 543705 disk_info.go:196] parse disk info done, disk is : [0xc000324480 0xc0003244c0]
E0320 12:42:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:33.409783 543705 memory.go:184] no items to output this cycle
I0320 12:42:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 12:42:38.475126 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:42:38.475132 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:42:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:43.410678 543705 memory.go:191] Add success.
I0320 12:42:43.409821 543705 cpu.go:282] Add success.
I0320 12:42:43.420388 543705 net.go:648] Add success.
I0320 12:42:43.423352 543705 net.go:770] primary dev: ETH0
I0320 12:42:43.423370 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:42:43.423387 543705 net.go:698] Add success.
I0320 12:42:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:42:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:42:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:42:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:42:53.409781 543705 memory.go:184] no items to output this cycle
I0320 12:42:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 12:43:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:03.409774 543705 memory.go:184] no items to output this cycle
I0320 12:43:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 12:43:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:13.409784 543705 memory.go:191] Add success.
I0320 12:43:13.409806 543705 cpu.go:282] Add success.
W0320 12:43:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:43:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:43:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:43:13.420290 543705 net.go:648] Add success.
I0320 12:43:13.422976 543705 net.go:770] primary dev: ETH0
I0320 12:43:13.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:43:13.423002 543705 net.go:698] Add success.
I0320 12:43:14.453958 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:43:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:43:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 12:43:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:43:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 12:43:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:43:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:43:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:43:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:43:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:43:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:43:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:23.409800 543705 memory.go:184] no items to output this cycle
I0320 12:43:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 12:43:24.365669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:43:24.368101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:43:24.368107 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bcc0 0xc00007bd00]
E0320 12:43:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:33.409812 543705 memory.go:184] no items to output this cycle
I0320 12:43:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:43:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:43.409802 543705 memory.go:191] Add success.
I0320 12:43:43.409804 543705 cpu.go:282] Add success.
I0320 12:43:43.419993 543705 net.go:648] Add success.
I0320 12:43:43.422782 543705 net.go:770] primary dev: ETH0
I0320 12:43:43.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:43:43.422808 543705 net.go:698] Add success.
I0320 12:43:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:43:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:43:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:43:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:43:53.409780 543705 memory.go:184] no items to output this cycle
I0320 12:43:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 12:44:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:03.409762 543705 memory.go:184] no items to output this cycle
I0320 12:44:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 12:44:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:13.409783 543705 memory.go:191] Add success.
I0320 12:44:13.409804 543705 cpu.go:282] Add success.
W0320 12:44:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:44:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:44:13.420078 543705 net.go:648] Add success.
I0320 12:44:13.422974 543705 net.go:770] primary dev: ETH0
I0320 12:44:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:44:13.423000 543705 net.go:698] Add success.
I0320 12:44:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:44:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:44:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 12:44:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:44:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 12:44:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:44:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:44:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:44:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:44:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:44:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:44:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:23.409775 543705 memory.go:184] no items to output this cycle
I0320 12:44:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 12:44:24.369672 543705 disk_info.go:125] begin check local disk info of client
I0320 12:44:24.372147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:44:24.372153 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb6c0 0xc0001fb700]
E0320 12:44:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:33.409810 543705 memory.go:184] no items to output this cycle
I0320 12:44:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 12:44:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:43.409830 543705 memory.go:191] Add success.
I0320 12:44:43.409843 543705 cpu.go:282] Add success.
I0320 12:44:43.420002 543705 net.go:648] Add success.
I0320 12:44:43.422665 543705 net.go:770] primary dev: ETH0
I0320 12:44:43.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:44:43.422692 543705 net.go:698] Add success.
I0320 12:44:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:44:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:44:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:44:53.409786 543705 memory.go:184] no items to output this cycle
I0320 12:44:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 12:45:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:03.409764 543705 memory.go:184] no items to output this cycle
I0320 12:45:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 12:45:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:13.409802 543705 memory.go:191] Add success.
I0320 12:45:13.409809 543705 cpu.go:282] Add success.
W0320 12:45:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:45:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:45:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:45:13.420041 543705 net.go:648] Add success.
I0320 12:45:13.422953 543705 net.go:770] primary dev: ETH0
I0320 12:45:13.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:45:13.422980 543705 net.go:698] Add success.
I0320 12:45:13.463419 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"49ee4eea-4208-4f10-993f-fcd4c234b924","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:45:13.463450 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:45:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:45:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:45:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 12:45:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:45:14.456655 543705 disk_worker.go:494] system disk:vda1
I0320 12:45:14.456680 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:45:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:45:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:45:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:45:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:45:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:45:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:23.409793 543705 memory.go:184] no items to output this cycle
I0320 12:45:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 12:45:24.373669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:45:24.376152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:45:24.376159 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbb40 0xc0001fbb80]
E0320 12:45:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:33.409782 543705 memory.go:184] no items to output this cycle
I0320 12:45:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 12:45:38.476139 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:45:38.476146 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:45:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:43.410753 543705 memory.go:191] Add success.
I0320 12:45:43.409826 543705 cpu.go:282] Add success.
I0320 12:45:43.420459 543705 net.go:648] Add success.
I0320 12:45:43.423194 543705 net.go:770] primary dev: ETH0
I0320 12:45:43.423207 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:45:43.423220 543705 net.go:698] Add success.
I0320 12:45:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:45:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:45:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:45:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:45:53.409774 543705 memory.go:184] no items to output this cycle
I0320 12:45:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 12:46:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:03.409798 543705 memory.go:184] no items to output this cycle
I0320 12:46:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 12:46:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:13.409775 543705 memory.go:191] Add success.
W0320 12:46:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:46:13.409802 543705 cpu.go:282] Add success.
W0320 12:46:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:46:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:46:13.419743 543705 net.go:648] Add success.
I0320 12:46:13.422340 543705 net.go:770] primary dev: ETH0
I0320 12:46:13.422355 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:46:13.422368 543705 net.go:698] Add success.
I0320 12:46:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:46:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:46:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 12:46:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:46:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 12:46:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:46:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:46:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:46:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:46:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:46:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:46:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 12:46:23.409788 543705 memory.go:184] no items to output this cycle
I0320 12:46:24.377676 543705 disk_info.go:125] begin check local disk info of client
I0320 12:46:24.380095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:46:24.380100 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474800 0xc000474840]
E0320 12:46:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:33.409805 543705 memory.go:184] no items to output this cycle
I0320 12:46:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 12:46:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:43.409824 543705 memory.go:191] Add success.
I0320 12:46:43.409827 543705 cpu.go:282] Add success.
I0320 12:46:43.420043 543705 net.go:648] Add success.
I0320 12:46:43.422684 543705 net.go:770] primary dev: ETH0
I0320 12:46:43.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:46:43.422712 543705 net.go:698] Add success.
I0320 12:46:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:46:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:46:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:46:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:46:53.409784 543705 cpu.go:275] no items to output this cycle
I0320 12:46:53.409790 543705 memory.go:184] no items to output this cycle
E0320 12:47:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:03.409781 543705 memory.go:184] no items to output this cycle
I0320 12:47:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 12:47:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:13.409782 543705 memory.go:191] Add success.
I0320 12:47:13.409792 543705 cpu.go:282] Add success.
W0320 12:47:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:47:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:47:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:47:13.420595 543705 net.go:648] Add success.
I0320 12:47:13.423404 543705 net.go:770] primary dev: ETH0
I0320 12:47:13.423424 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:47:13.423438 543705 net.go:698] Add success.
I0320 12:47:13.452770 543705 event_worker.go:152] Polling the log file for events...
W0320 12:47:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:47:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 12:47:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:47:14.455900 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:47:14.455908 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:47:14.455914 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:47:14.456552 543705 disk_worker.go:494] system disk:vda1
I0320 12:47:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:47:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:47:15.456823 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:47:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:47:16.457950 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:47:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:47:16.458028 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:47:16.472359 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:47:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:23.409797 543705 memory.go:184] no items to output this cycle
I0320 12:47:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 12:47:24.381670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:47:24.384156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:47:24.384161 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394800 0xc000394840]
E0320 12:47:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:33.409776 543705 memory.go:184] no items to output this cycle
I0320 12:47:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 12:47:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:43.409790 543705 memory.go:191] Add success.
I0320 12:47:43.409812 543705 cpu.go:282] Add success.
I0320 12:47:43.420000 543705 net.go:648] Add success.
I0320 12:47:43.423000 543705 net.go:770] primary dev: ETH0
I0320 12:47:43.423014 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:47:43.423028 543705 net.go:698] Add success.
I0320 12:47:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:47:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:47:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:47:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:47:53.409777 543705 memory.go:184] no items to output this cycle
I0320 12:47:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:48:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:03.409799 543705 memory.go:184] no items to output this cycle
I0320 12:48:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 12:48:13.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:13.409925 543705 memory.go:191] Add success.
I0320 12:48:13.409939 543705 cpu.go:282] Add success.
W0320 12:48:13.410075 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:48:13.410089 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:48:13.410092 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:48:13.419763 543705 net.go:648] Add success.
I0320 12:48:13.422452 543705 net.go:770] primary dev: ETH0
I0320 12:48:13.422465 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:48:13.422476 543705 net.go:698] Add success.
I0320 12:48:13.472435 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d61c4e7b-edad-4eeb-9526-b29eae2d7598","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:48:13.472468 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:48:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:48:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:48:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 12:48:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:48:14.456814 543705 disk_worker.go:494] system disk:vda1
I0320 12:48:14.456844 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:48:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:48:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:48:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:48:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:48:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:48:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:23.409775 543705 memory.go:184] no items to output this cycle
I0320 12:48:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 12:48:24.385669 543705 disk_info.go:125] begin check local disk info of client
I0320 12:48:24.388090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:48:24.388096 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b300 0xc00007b340]
E0320 12:48:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:33.409813 543705 memory.go:184] no items to output this cycle
I0320 12:48:33.409824 543705 cpu.go:275] no items to output this cycle
I0320 12:48:38.477139 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:48:38.477146 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:48:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:43.410661 543705 memory.go:191] Add success.
I0320 12:48:43.409828 543705 cpu.go:282] Add success.
I0320 12:48:43.420354 543705 net.go:648] Add success.
I0320 12:48:43.422919 543705 net.go:770] primary dev: ETH0
I0320 12:48:43.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:48:43.422944 543705 net.go:698] Add success.
I0320 12:48:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:48:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:48:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:48:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:48:53.409787 543705 memory.go:184] no items to output this cycle
I0320 12:48:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 12:49:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:03.409780 543705 memory.go:184] no items to output this cycle
I0320 12:49:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 12:49:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:13.409817 543705 memory.go:191] Add success.
I0320 12:49:13.409827 543705 cpu.go:282] Add success.
W0320 12:49:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:49:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:49:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:49:13.420166 543705 net.go:648] Add success.
I0320 12:49:13.423269 543705 net.go:770] primary dev: ETH0
I0320 12:49:13.423282 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:49:13.423294 543705 net.go:698] Add success.
I0320 12:49:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:49:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:49:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 12:49:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:49:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 12:49:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:49:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:49:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:49:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:49:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:49:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:23.409762 543705 memory.go:184] no items to output this cycle
I0320 12:49:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 12:49:24.392002 543705 disk_info.go:125] begin check local disk info of client
I0320 12:49:24.394471 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:49:24.394477 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faac0 0xc0001fab00]
E0320 12:49:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:33.409818 543705 memory.go:184] no items to output this cycle
I0320 12:49:33.409836 543705 cpu.go:275] no items to output this cycle
E0320 12:49:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:43.409799 543705 memory.go:191] Add success.
I0320 12:49:43.409812 543705 cpu.go:282] Add success.
I0320 12:49:43.419913 543705 net.go:648] Add success.
I0320 12:49:43.422586 543705 net.go:770] primary dev: ETH0
I0320 12:49:43.422601 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:49:43.422615 543705 net.go:698] Add success.
I0320 12:49:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:49:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:49:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:49:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:49:53.409797 543705 memory.go:184] no items to output this cycle
I0320 12:49:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 12:50:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:03.409907 543705 memory.go:184] no items to output this cycle
I0320 12:50:03.409908 543705 cpu.go:275] no items to output this cycle
E0320 12:50:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:13.409781 543705 memory.go:191] Add success.
W0320 12:50:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:50:13.409806 543705 cpu.go:282] Add success.
W0320 12:50:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:50:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:50:13.420072 543705 net.go:648] Add success.
I0320 12:50:13.423317 543705 net.go:770] primary dev: ETH0
I0320 12:50:13.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:50:13.423346 543705 net.go:698] Add success.
I0320 12:50:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:50:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:50:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 12:50:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:50:14.456485 543705 disk_worker.go:494] system disk:vda1
I0320 12:50:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:50:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:50:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:50:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:50:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:50:16.472444 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:50:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:23.409803 543705 memory.go:184] no items to output this cycle
I0320 12:50:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 12:50:24.397670 543705 disk_info.go:125] begin check local disk info of client
I0320 12:50:24.400156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:50:24.400161 543705 disk_info.go:196] parse disk info done, disk is : [0xc000289580 0xc0002895c0]
E0320 12:50:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:33.409784 543705 memory.go:184] no items to output this cycle
I0320 12:50:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:50:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:43.409787 543705 memory.go:191] Add success.
I0320 12:50:43.409810 543705 cpu.go:282] Add success.
I0320 12:50:43.419960 543705 net.go:648] Add success.
I0320 12:50:43.422814 543705 net.go:770] primary dev: ETH0
I0320 12:50:43.422828 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:50:43.422841 543705 net.go:698] Add success.
I0320 12:50:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:50:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:50:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:50:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:50:53.409768 543705 memory.go:184] no items to output this cycle
I0320 12:50:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 12:51:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:03.409777 543705 memory.go:184] no items to output this cycle
I0320 12:51:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 12:51:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:13.409787 543705 memory.go:191] Add success.
W0320 12:51:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 12:51:13.409818 543705 cpu.go:282] Add success.
W0320 12:51:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:51:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:51:13.420100 543705 net.go:648] Add success.
I0320 12:51:13.422898 543705 net.go:770] primary dev: ETH0
I0320 12:51:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:51:13.422923 543705 net.go:698] Add success.
I0320 12:51:13.469676 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"be9fc4ec-b1bd-46a1-9b26-00a167ce8845","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:51:13.469707 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:51:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:51:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:51:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 12:51:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:51:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 12:51:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:51:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:51:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:51:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:51:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:51:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:51:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:23.409774 543705 memory.go:184] no items to output this cycle
I0320 12:51:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 12:51:24.401668 543705 disk_info.go:125] begin check local disk info of client
I0320 12:51:24.404094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:51:24.404100 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa780 0xc0001aa7c0]
E0320 12:51:33.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:33.409822 543705 memory.go:184] no items to output this cycle
I0320 12:51:33.409834 543705 cpu.go:275] no items to output this cycle
I0320 12:51:38.478140 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:51:38.478147 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:51:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:43.410682 543705 memory.go:191] Add success.
I0320 12:51:43.409847 543705 cpu.go:282] Add success.
I0320 12:51:43.420469 543705 net.go:648] Add success.
I0320 12:51:43.423152 543705 net.go:770] primary dev: ETH0
I0320 12:51:43.423167 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:51:43.423181 543705 net.go:698] Add success.
I0320 12:51:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:51:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:51:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:51:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:51:53.409778 543705 memory.go:184] no items to output this cycle
I0320 12:51:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 12:52:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:03.409798 543705 memory.go:184] no items to output this cycle
I0320 12:52:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 12:52:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:13.409816 543705 memory.go:191] Add success.
I0320 12:52:13.409827 543705 cpu.go:282] Add success.
W0320 12:52:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:52:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:52:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:52:13.420131 543705 net.go:648] Add success.
I0320 12:52:13.423121 543705 net.go:770] primary dev: ETH0
I0320 12:52:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:52:13.423147 543705 net.go:698] Add success.
W0320 12:52:14.455083 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:52:14.455143 543705 disk_worker.go:708] disk space is not compliant
W0320 12:52:14.455146 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:52:14.456949 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:52:14.456959 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:52:14.456965 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:52:14.457010 543705 disk_worker.go:494] system disk:vda1
I0320 12:52:14.457051 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:52:15.456848 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:52:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:52:16.457935 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:52:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:52:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:52:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:52:16.472344 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:52:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:23.409775 543705 memory.go:184] no items to output this cycle
I0320 12:52:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 12:52:24.405673 543705 disk_info.go:125] begin check local disk info of client
I0320 12:52:24.408090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:52:24.408096 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b1340 0xc0003b1380]
E0320 12:52:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:33.409796 543705 memory.go:184] no items to output this cycle
I0320 12:52:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:52:43.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:43.409835 543705 memory.go:191] Add success.
I0320 12:52:43.409843 543705 cpu.go:282] Add success.
I0320 12:52:43.419972 543705 net.go:648] Add success.
I0320 12:52:43.422882 543705 net.go:770] primary dev: ETH0
I0320 12:52:43.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:52:43.422911 543705 net.go:698] Add success.
I0320 12:52:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:52:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:52:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:52:53.410273 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:52:53.410294 543705 memory.go:184] no items to output this cycle
I0320 12:52:53.410311 543705 cpu.go:275] no items to output this cycle
E0320 12:53:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:03.409797 543705 memory.go:184] no items to output this cycle
I0320 12:53:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 12:53:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:13.409783 543705 memory.go:191] Add success.
W0320 12:53:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:53:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:53:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:53:13.409855 543705 cpu.go:282] Add success.
I0320 12:53:13.420187 543705 net.go:648] Add success.
I0320 12:53:13.423202 543705 net.go:770] primary dev: ETH0
I0320 12:53:13.423215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:53:13.423227 543705 net.go:698] Add success.
I0320 12:53:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:53:14.455082 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:53:14.455146 543705 disk_worker.go:708] disk space is not compliant
W0320 12:53:14.455149 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:53:14.456477 543705 disk_worker.go:494] system disk:vda1
I0320 12:53:14.456522 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:53:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:53:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:53:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:53:16.472456 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:53:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:23.409797 543705 memory.go:184] no items to output this cycle
I0320 12:53:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 12:53:24.409676 543705 disk_info.go:125] begin check local disk info of client
I0320 12:53:24.412169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:53:24.412174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b700 0xc00007b740]
E0320 12:53:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:33.409792 543705 memory.go:184] no items to output this cycle
I0320 12:53:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 12:53:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:43.409799 543705 memory.go:191] Add success.
I0320 12:53:43.409816 543705 cpu.go:282] Add success.
I0320 12:53:43.420007 543705 net.go:648] Add success.
I0320 12:53:43.422754 543705 net.go:770] primary dev: ETH0
I0320 12:53:43.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:53:43.422781 543705 net.go:698] Add success.
I0320 12:53:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:53:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:53:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:53:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:53:53.409780 543705 memory.go:184] no items to output this cycle
I0320 12:53:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 12:54:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:03.409873 543705 memory.go:184] no items to output this cycle
I0320 12:54:03.409947 543705 cpu.go:275] no items to output this cycle
E0320 12:54:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:13.409821 543705 memory.go:191] Add success.
I0320 12:54:13.409828 543705 cpu.go:282] Add success.
W0320 12:54:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:54:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:54:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:54:13.420120 543705 net.go:648] Add success.
I0320 12:54:13.423112 543705 net.go:770] primary dev: ETH0
I0320 12:54:13.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:54:13.423137 543705 net.go:698] Add success.
I0320 12:54:13.470385 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cb34fea9-9de2-4201-a42a-ca4714fadcfa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:54:13.470420 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 12:54:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:54:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:54:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 12:54:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:54:14.456530 543705 disk_worker.go:494] system disk:vda1
I0320 12:54:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:54:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:54:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:54:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:54:16.472364 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:54:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:23.409801 543705 memory.go:184] no items to output this cycle
I0320 12:54:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 12:54:24.412789 543705 disk_info.go:125] begin check local disk info of client
I0320 12:54:24.415241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:54:24.415246 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474f00 0xc000474f40]
E0320 12:54:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:33.409818 543705 memory.go:184] no items to output this cycle
I0320 12:54:33.409832 543705 cpu.go:275] no items to output this cycle
I0320 12:54:38.479137 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:54:38.479144 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:54:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:43.410864 543705 memory.go:191] Add success.
I0320 12:54:43.409836 543705 cpu.go:282] Add success.
I0320 12:54:43.420551 543705 net.go:648] Add success.
I0320 12:54:43.423376 543705 net.go:770] primary dev: ETH0
I0320 12:54:43.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:54:43.423403 543705 net.go:698] Add success.
I0320 12:54:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:54:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:54:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:54:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:54:53.409800 543705 memory.go:184] no items to output this cycle
I0320 12:54:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 12:55:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:03.409782 543705 memory.go:184] no items to output this cycle
I0320 12:55:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 12:55:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:13.409783 543705 memory.go:191] Add success.
I0320 12:55:13.409788 543705 cpu.go:282] Add success.
W0320 12:55:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:55:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:55:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:55:13.420076 543705 net.go:648] Add success.
I0320 12:55:13.422770 543705 net.go:770] primary dev: ETH0
I0320 12:55:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:55:13.422795 543705 net.go:698] Add success.
I0320 12:55:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:55:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:55:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 12:55:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:55:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 12:55:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:55:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:55:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:55:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:55:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:55:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:55:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:23.409781 543705 memory.go:184] no items to output this cycle
I0320 12:55:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 12:55:24.415790 543705 disk_info.go:125] begin check local disk info of client
I0320 12:55:24.418250 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:55:24.418256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaf80 0xc0001aafc0]
E0320 12:55:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:33.409806 543705 memory.go:184] no items to output this cycle
I0320 12:55:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 12:55:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:43.409787 543705 memory.go:191] Add success.
I0320 12:55:43.409817 543705 cpu.go:282] Add success.
I0320 12:55:43.419872 543705 net.go:648] Add success.
I0320 12:55:43.422578 543705 net.go:770] primary dev: ETH0
I0320 12:55:43.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:55:43.422604 543705 net.go:698] Add success.
I0320 12:55:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:55:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:55:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:55:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:55:53.409799 543705 memory.go:184] no items to output this cycle
I0320 12:55:53.409807 543705 cpu.go:275] no items to output this cycle
I0320 12:56:03.409875 543705 cpu.go:275] no items to output this cycle
E0320 12:56:03.409949 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:03.409964 543705 memory.go:184] no items to output this cycle
E0320 12:56:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:13.409796 543705 cpu.go:282] Add success.
I0320 12:56:13.409805 543705 memory.go:191] Add success.
W0320 12:56:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:56:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:56:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:56:13.420049 543705 net.go:648] Add success.
I0320 12:56:13.422798 543705 net.go:770] primary dev: ETH0
I0320 12:56:13.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:56:13.422822 543705 net.go:698] Add success.
I0320 12:56:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:56:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:56:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 12:56:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:56:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 12:56:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:56:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:56:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:56:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:56:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:56:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:23.409790 543705 memory.go:184] no items to output this cycle
I0320 12:56:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 12:56:24.418793 543705 disk_info.go:125] begin check local disk info of client
I0320 12:56:24.421211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:56:24.421216 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475640 0xc000475680]
E0320 12:56:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 12:56:33.409794 543705 memory.go:184] no items to output this cycle
E0320 12:56:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:43.409813 543705 memory.go:191] Add success.
I0320 12:56:43.409817 543705 cpu.go:282] Add success.
I0320 12:56:43.420106 543705 net.go:648] Add success.
I0320 12:56:43.422906 543705 net.go:770] primary dev: ETH0
I0320 12:56:43.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:56:43.422934 543705 net.go:698] Add success.
I0320 12:56:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:56:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:56:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:56:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:56:53.409778 543705 memory.go:184] no items to output this cycle
I0320 12:56:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 12:57:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:03.409793 543705 memory.go:184] no items to output this cycle
I0320 12:57:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 12:57:13.410462 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:13.410488 543705 memory.go:191] Add success.
W0320 12:57:13.410517 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:57:13.410530 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:57:13.410538 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:57:13.410672 543705 cpu.go:282] Add success.
I0320 12:57:13.419734 543705 net.go:648] Add success.
I0320 12:57:13.428636 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 12:57:13.428721 543705 net.go:770] primary dev: ETH0
I0320 12:57:13.428734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:57:13.428749 543705 net.go:698] Add success.
I0320 12:57:13.453286 543705 event_worker.go:152] Polling the log file for events...
I0320 12:57:13.581887 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ce70a01f-c668-4a19-b024-831d6620bf47","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 12:57:13.581918 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 12:57:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:57:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 12:57:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 12:57:14.455874 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 12:57:14.455883 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 12:57:14.455888 543705 custom_config.go:64] query custom config with name: gpu
I0320 12:57:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 12:57:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 12:57:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 12:57:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:57:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 12:57:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 12:57:16.457994 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:57:16.458014 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:57:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:57:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:23.409767 543705 memory.go:184] no items to output this cycle
I0320 12:57:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 12:57:24.421799 543705 disk_info.go:125] begin check local disk info of client
I0320 12:57:24.424314 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:57:24.424319 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4680 0xc0000c46c0]
E0320 12:57:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:33.409783 543705 memory.go:184] no items to output this cycle
I0320 12:57:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 12:57:38.480155 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 12:57:38.480161 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 12:57:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:43.410681 543705 memory.go:191] Add success.
I0320 12:57:43.409816 543705 cpu.go:282] Add success.
I0320 12:57:43.420433 543705 net.go:648] Add success.
I0320 12:57:43.423086 543705 net.go:770] primary dev: ETH0
I0320 12:57:43.423102 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:57:43.423117 543705 net.go:698] Add success.
I0320 12:57:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:57:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:57:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:57:53.410276 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:57:53.410302 543705 memory.go:184] no items to output this cycle
I0320 12:57:53.410303 543705 cpu.go:275] no items to output this cycle
E0320 12:58:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:03.409760 543705 memory.go:184] no items to output this cycle
I0320 12:58:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 12:58:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:13.409794 543705 memory.go:191] Add success.
I0320 12:58:13.409795 543705 cpu.go:282] Add success.
W0320 12:58:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:58:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:58:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:58:13.420124 543705 net.go:648] Add success.
I0320 12:58:13.423605 543705 net.go:770] primary dev: ETH0
I0320 12:58:13.423618 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:58:13.423629 543705 net.go:698] Add success.
I0320 12:58:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:58:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:58:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 12:58:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:58:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 12:58:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:58:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:58:16.458035 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:58:16.458097 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:58:16.458116 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:58:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:58:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:23.409762 543705 memory.go:184] no items to output this cycle
I0320 12:58:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 12:58:24.424810 543705 disk_info.go:125] begin check local disk info of client
I0320 12:58:24.427259 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:58:24.427264 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa140 0xc0001fa180]
E0320 12:58:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:33.409808 543705 memory.go:184] no items to output this cycle
I0320 12:58:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 12:58:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:43.409788 543705 memory.go:191] Add success.
I0320 12:58:43.409809 543705 cpu.go:282] Add success.
I0320 12:58:43.419881 543705 net.go:648] Add success.
I0320 12:58:43.422221 543705 net.go:770] primary dev: ETH0
I0320 12:58:43.422235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:58:43.422249 543705 net.go:698] Add success.
I0320 12:58:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:58:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:58:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:58:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:58:53.409767 543705 memory.go:184] no items to output this cycle
I0320 12:58:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 12:59:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:03.409769 543705 memory.go:184] no items to output this cycle
I0320 12:59:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 12:59:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:13.409793 543705 memory.go:191] Add success.
I0320 12:59:13.409794 543705 cpu.go:282] Add success.
W0320 12:59:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 12:59:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 12:59:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 12:59:13.420103 543705 net.go:648] Add success.
I0320 12:59:13.422718 543705 net.go:770] primary dev: ETH0
I0320 12:59:13.422733 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:59:13.422746 543705 net.go:698] Add success.
I0320 12:59:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 12:59:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 12:59:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 12:59:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 12:59:14.456561 543705 disk_worker.go:494] system disk:vda1
I0320 12:59:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 12:59:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 12:59:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:59:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:59:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 12:59:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 12:59:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:23.409768 543705 memory.go:184] no items to output this cycle
I0320 12:59:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 12:59:24.427829 543705 disk_info.go:125] begin check local disk info of client
I0320 12:59:24.430303 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 12:59:24.430308 543705 disk_info.go:196] parse disk info done, disk is : [0xc000327200 0xc000327240]
E0320 12:59:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:33.409774 543705 memory.go:184] no items to output this cycle
I0320 12:59:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 12:59:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:43.409807 543705 memory.go:191] Add success.
I0320 12:59:43.409809 543705 cpu.go:282] Add success.
I0320 12:59:43.419944 543705 net.go:648] Add success.
I0320 12:59:43.422706 543705 net.go:770] primary dev: ETH0
I0320 12:59:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 12:59:43.422732 543705 net.go:698] Add success.
I0320 12:59:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 12:59:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 12:59:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 12:59:53.410275 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 12:59:53.410293 543705 memory.go:184] no items to output this cycle
I0320 12:59:53.410294 543705 cpu.go:275] no items to output this cycle
E0320 13:00:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:03.409776 543705 memory.go:184] no items to output this cycle
I0320 13:00:03.409779 543705 cpu.go:275] no items to output this cycle
E0320 13:00:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:13.409815 543705 memory.go:191] Add success.
I0320 13:00:13.409823 543705 cpu.go:282] Add success.
W0320 13:00:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:00:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:00:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:00:13.420140 543705 net.go:648] Add success.
I0320 13:00:13.422901 543705 net.go:770] primary dev: ETH0
I0320 13:00:13.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:00:13.422926 543705 net.go:698] Add success.
I0320 13:00:13.464047 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7112539b-ad56-4dbb-b13a-9d150721c68b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:00:13.464081 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:00:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:00:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:00:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 13:00:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:00:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 13:00:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:00:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:00:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:00:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:00:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:00:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:00:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:23.409811 543705 memory.go:184] no items to output this cycle
I0320 13:00:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 13:00:24.430853 543705 disk_info.go:125] begin check local disk info of client
I0320 13:00:24.433278 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:00:24.433283 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395e40 0xc000395e80]
E0320 13:00:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:33.409820 543705 memory.go:184] no items to output this cycle
I0320 13:00:33.409832 543705 cpu.go:275] no items to output this cycle
I0320 13:00:38.481158 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:00:38.481165 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:00:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:43.410587 543705 memory.go:191] Add success.
I0320 13:00:43.409805 543705 cpu.go:282] Add success.
I0320 13:00:43.420327 543705 net.go:648] Add success.
I0320 13:00:43.422864 543705 net.go:770] primary dev: ETH0
I0320 13:00:43.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:00:43.422889 543705 net.go:698] Add success.
I0320 13:00:46.458032 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:00:46.458095 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:00:46.458121 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:00:53.410375 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:00:53.410394 543705 memory.go:184] no items to output this cycle
I0320 13:00:53.410401 543705 cpu.go:275] no items to output this cycle
E0320 13:01:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:03.409780 543705 memory.go:184] no items to output this cycle
I0320 13:01:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 13:01:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:13.409821 543705 memory.go:191] Add success.
I0320 13:01:13.409835 543705 cpu.go:282] Add success.
W0320 13:01:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:01:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:01:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:01:13.420157 543705 net.go:648] Add success.
I0320 13:01:13.422952 543705 net.go:770] primary dev: ETH0
I0320 13:01:13.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:01:13.422977 543705 net.go:698] Add success.
I0320 13:01:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:01:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:01:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 13:01:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:01:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 13:01:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:01:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:01:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:01:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:01:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:01:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:01:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:23.409783 543705 memory.go:184] no items to output this cycle
I0320 13:01:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 13:01:24.433856 543705 disk_info.go:125] begin check local disk info of client
I0320 13:01:24.436258 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:01:24.436264 543705 disk_info.go:196] parse disk info done, disk is : [0xc000353bc0 0xc000353c00]
E0320 13:01:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:33.409811 543705 memory.go:184] no items to output this cycle
I0320 13:01:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 13:01:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:43.409812 543705 memory.go:191] Add success.
I0320 13:01:43.409826 543705 cpu.go:282] Add success.
I0320 13:01:43.419978 543705 net.go:648] Add success.
I0320 13:01:43.422702 543705 net.go:770] primary dev: ETH0
I0320 13:01:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:01:43.422729 543705 net.go:698] Add success.
I0320 13:01:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:01:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:01:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:01:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:01:53.409771 543705 memory.go:184] no items to output this cycle
I0320 13:01:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 13:02:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:03.409801 543705 memory.go:184] no items to output this cycle
I0320 13:02:03.409836 543705 cpu.go:275] no items to output this cycle
E0320 13:02:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:13.409788 543705 memory.go:191] Add success.
I0320 13:02:13.409800 543705 cpu.go:282] Add success.
W0320 13:02:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:02:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:02:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:02:13.420147 543705 net.go:648] Add success.
I0320 13:02:13.422685 543705 net.go:770] primary dev: ETH0
I0320 13:02:13.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:02:13.422710 543705 net.go:698] Add success.
W0320 13:02:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:02:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 13:02:14.455158 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:02:14.456924 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:02:14.456934 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:02:14.456940 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:02:14.456987 543705 disk_worker.go:494] system disk:vda1
I0320 13:02:14.457029 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:02:15.456904 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:02:15.456917 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:02:16.458026 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:02:16.458033 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:02:16.458079 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:02:16.458097 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:02:16.472492 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:02:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:23.409764 543705 memory.go:184] no items to output this cycle
I0320 13:02:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 13:02:24.436877 543705 disk_info.go:125] begin check local disk info of client
I0320 13:02:24.439341 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:02:24.439347 543705 disk_info.go:196] parse disk info done, disk is : [0xc000358300 0xc000358340]
E0320 13:02:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:33.409811 543705 memory.go:184] no items to output this cycle
I0320 13:02:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 13:02:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:43.409788 543705 memory.go:191] Add success.
I0320 13:02:43.409813 543705 cpu.go:282] Add success.
I0320 13:02:43.419874 543705 net.go:648] Add success.
I0320 13:02:43.423252 543705 net.go:770] primary dev: ETH0
I0320 13:02:43.423266 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:02:43.423277 543705 net.go:698] Add success.
I0320 13:02:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:02:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:02:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:02:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:02:53.409785 543705 memory.go:184] no items to output this cycle
I0320 13:02:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 13:03:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:03.409765 543705 memory.go:184] no items to output this cycle
I0320 13:03:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 13:03:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:13.409792 543705 memory.go:191] Add success.
I0320 13:03:13.409800 543705 cpu.go:282] Add success.
W0320 13:03:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:03:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:03:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:03:13.420310 543705 net.go:648] Add success.
I0320 13:03:13.423109 543705 net.go:770] primary dev: ETH0
I0320 13:03:13.423122 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:03:13.423133 543705 net.go:698] Add success.
I0320 13:03:13.469603 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f0309c7e-bab0-4dfd-83d5-ff00158536bf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:03:13.469635 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:03:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:03:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:03:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 13:03:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:03:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 13:03:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:03:15.455604 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:03:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:03:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:03:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:03:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:03:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:23.409778 543705 memory.go:184] no items to output this cycle
I0320 13:03:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 13:03:24.439889 543705 disk_info.go:125] begin check local disk info of client
I0320 13:03:24.442332 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:03:24.442337 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abb00 0xc0001abb40]
E0320 13:03:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:33.409780 543705 memory.go:184] no items to output this cycle
I0320 13:03:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 13:03:38.482167 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:03:38.482174 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:03:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:43.410672 543705 memory.go:191] Add success.
I0320 13:03:43.409822 543705 cpu.go:282] Add success.
I0320 13:03:43.420546 543705 net.go:648] Add success.
I0320 13:03:43.423452 543705 net.go:770] primary dev: ETH0
I0320 13:03:43.423467 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:03:43.423481 543705 net.go:698] Add success.
I0320 13:03:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:03:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:03:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:03:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:03:53.409800 543705 memory.go:184] no items to output this cycle
I0320 13:03:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:04:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:03.409799 543705 memory.go:184] no items to output this cycle
I0320 13:04:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:04:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:13.409810 543705 memory.go:191] Add success.
I0320 13:04:13.409819 543705 cpu.go:282] Add success.
W0320 13:04:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:04:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:04:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:04:13.420113 543705 net.go:648] Add success.
I0320 13:04:13.423134 543705 net.go:770] primary dev: ETH0
I0320 13:04:13.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:04:13.423159 543705 net.go:698] Add success.
I0320 13:04:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:04:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:04:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 13:04:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:04:14.456608 543705 disk_worker.go:494] system disk:vda1
I0320 13:04:14.456637 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:04:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:04:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:04:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:04:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:04:16.472449 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:04:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:23.409771 543705 memory.go:184] no items to output this cycle
I0320 13:04:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 13:04:24.442908 543705 disk_info.go:125] begin check local disk info of client
I0320 13:04:24.445345 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:04:24.445352 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fae40 0xc0001faec0]
E0320 13:04:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:33.409788 543705 memory.go:184] no items to output this cycle
I0320 13:04:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 13:04:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:43.409780 543705 memory.go:191] Add success.
I0320 13:04:43.409801 543705 cpu.go:282] Add success.
I0320 13:04:43.420204 543705 net.go:648] Add success.
I0320 13:04:43.423014 543705 net.go:770] primary dev: ETH0
I0320 13:04:43.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:04:43.423038 543705 net.go:698] Add success.
I0320 13:04:46.458007 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:04:46.458077 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:04:46.458105 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:04:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:04:53.409782 543705 memory.go:184] no items to output this cycle
I0320 13:04:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 13:05:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:03.409775 543705 memory.go:184] no items to output this cycle
I0320 13:05:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 13:05:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:13.409788 543705 memory.go:191] Add success.
I0320 13:05:13.409789 543705 cpu.go:282] Add success.
W0320 13:05:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:05:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:05:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:05:13.420147 543705 net.go:648] Add success.
I0320 13:05:13.422955 543705 net.go:770] primary dev: ETH0
I0320 13:05:13.422968 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:05:13.422979 543705 net.go:698] Add success.
I0320 13:05:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:05:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:05:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 13:05:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:05:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 13:05:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:05:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:05:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:05:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:05:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:05:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:05:23.410409 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:23.410424 543705 memory.go:184] no items to output this cycle
I0320 13:05:23.410446 543705 cpu.go:275] no items to output this cycle
I0320 13:05:24.445930 543705 disk_info.go:125] begin check local disk info of client
I0320 13:05:24.448392 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:05:24.448398 543705 disk_info.go:196] parse disk info done, disk is : [0xc000533180 0xc0005331c0]
E0320 13:05:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:33.409784 543705 memory.go:184] no items to output this cycle
I0320 13:05:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:05:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:43.409831 543705 memory.go:191] Add success.
I0320 13:05:43.409835 543705 cpu.go:282] Add success.
I0320 13:05:43.419719 543705 net.go:648] Add success.
I0320 13:05:43.422530 543705 net.go:770] primary dev: ETH0
I0320 13:05:43.422543 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:05:43.422554 543705 net.go:698] Add success.
I0320 13:05:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:05:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:05:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:05:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:05:53.409776 543705 memory.go:184] no items to output this cycle
I0320 13:05:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 13:06:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:03.409778 543705 memory.go:184] no items to output this cycle
I0320 13:06:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 13:06:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:13.409808 543705 memory.go:191] Add success.
I0320 13:06:13.409816 543705 cpu.go:282] Add success.
W0320 13:06:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:06:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:06:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:06:13.420059 543705 net.go:648] Add success.
I0320 13:06:13.422692 543705 net.go:770] primary dev: ETH0
I0320 13:06:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:06:13.422721 543705 net.go:698] Add success.
I0320 13:06:13.469840 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"657f17c0-ee99-4c7d-9406-49f240d7e7e7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:06:13.469874 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:06:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:06:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:06:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 13:06:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:06:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 13:06:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:06:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:06:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:06:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:06:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:06:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:06:23.409793 543705 memory.go:184] no items to output this cycle
I0320 13:06:24.448934 543705 disk_info.go:125] begin check local disk info of client
I0320 13:06:24.451420 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:06:24.451425 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b6500 0xc0002b6540]
E0320 13:06:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:33.409775 543705 memory.go:184] no items to output this cycle
I0320 13:06:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 13:06:38.483177 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:06:38.483183 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:06:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:43.410650 543705 memory.go:191] Add success.
I0320 13:06:43.409816 543705 cpu.go:282] Add success.
I0320 13:06:43.420642 543705 net.go:648] Add success.
I0320 13:06:43.423377 543705 net.go:770] primary dev: ETH0
I0320 13:06:43.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:06:43.423402 543705 net.go:698] Add success.
I0320 13:06:46.458015 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:06:46.458089 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:06:46.458125 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:06:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:06:53.409778 543705 memory.go:184] no items to output this cycle
I0320 13:06:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 13:07:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:03.409783 543705 memory.go:184] no items to output this cycle
I0320 13:07:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 13:07:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:13.409806 543705 memory.go:191] Add success.
I0320 13:07:13.409815 543705 cpu.go:282] Add success.
W0320 13:07:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:07:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:07:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:07:13.420085 543705 net.go:648] Add success.
I0320 13:07:13.422873 543705 net.go:770] primary dev: ETH0
I0320 13:07:13.422888 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:07:13.422901 543705 net.go:698] Add success.
I0320 13:07:13.453446 543705 event_worker.go:152] Polling the log file for events...
W0320 13:07:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:07:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 13:07:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:07:14.456951 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:07:14.456961 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:07:14.456967 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:07:14.457006 543705 disk_worker.go:494] system disk:vda1
I0320 13:07:14.457034 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:07:15.456854 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:07:15.456861 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:07:16.457909 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:07:16.457909 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:07:16.457965 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:07:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:07:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:07:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:23.409785 543705 memory.go:184] no items to output this cycle
I0320 13:07:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 13:07:24.451941 543705 disk_info.go:125] begin check local disk info of client
I0320 13:07:24.454428 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:07:24.454434 543705 disk_info.go:196] parse disk info done, disk is : [0xc000260b40 0xc000260b80]
E0320 13:07:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:07:33.409792 543705 memory.go:184] no items to output this cycle
E0320 13:07:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:43.409807 543705 memory.go:191] Add success.
I0320 13:07:43.409807 543705 cpu.go:282] Add success.
I0320 13:07:43.419756 543705 net.go:648] Add success.
I0320 13:07:43.422262 543705 net.go:770] primary dev: ETH0
I0320 13:07:43.422275 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:07:43.422288 543705 net.go:698] Add success.
I0320 13:07:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:07:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:07:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:07:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:07:53.409798 543705 memory.go:184] no items to output this cycle
I0320 13:07:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:08:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:03.409776 543705 cpu.go:275] no items to output this cycle
I0320 13:08:03.409780 543705 memory.go:184] no items to output this cycle
E0320 13:08:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:13.409787 543705 memory.go:191] Add success.
I0320 13:08:13.409786 543705 cpu.go:282] Add success.
W0320 13:08:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:08:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:08:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:08:13.420215 543705 net.go:648] Add success.
I0320 13:08:13.423285 543705 net.go:770] primary dev: ETH0
I0320 13:08:13.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:08:13.423310 543705 net.go:698] Add success.
I0320 13:08:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:08:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:08:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 13:08:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:08:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 13:08:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:08:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:08:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:08:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:08:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:08:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:08:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:23.409779 543705 memory.go:184] no items to output this cycle
I0320 13:08:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 13:08:24.454964 543705 disk_info.go:125] begin check local disk info of client
I0320 13:08:24.457455 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:08:24.457461 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b64c0 0xc0002b6500]
E0320 13:08:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:33.409782 543705 memory.go:184] no items to output this cycle
I0320 13:08:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 13:08:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:43.409790 543705 memory.go:191] Add success.
I0320 13:08:43.409821 543705 cpu.go:282] Add success.
I0320 13:08:43.420223 543705 net.go:648] Add success.
I0320 13:08:43.422894 543705 net.go:770] primary dev: ETH0
I0320 13:08:43.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:08:43.422923 543705 net.go:698] Add success.
I0320 13:08:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:08:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:08:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:08:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:08:53.409773 543705 memory.go:184] no items to output this cycle
I0320 13:08:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 13:09:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:03.409799 543705 memory.go:184] no items to output this cycle
I0320 13:09:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:09:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:13.409786 543705 memory.go:191] Add success.
I0320 13:09:13.409787 543705 cpu.go:282] Add success.
W0320 13:09:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:09:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:09:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:09:13.420122 543705 net.go:648] Add success.
I0320 13:09:13.423035 543705 net.go:770] primary dev: ETH0
I0320 13:09:13.423048 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:09:13.423059 543705 net.go:698] Add success.
I0320 13:09:13.470626 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6402604d-f1bd-497c-a7fb-611381a46881","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:09:13.470660 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:09:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:09:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:09:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 13:09:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:09:14.456515 543705 disk_worker.go:494] system disk:vda1
I0320 13:09:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:09:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:09:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:09:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:09:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:09:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:09:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 13:09:23.409791 543705 memory.go:184] no items to output this cycle
I0320 13:09:24.457975 543705 disk_info.go:125] begin check local disk info of client
I0320 13:09:24.460451 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:09:24.460457 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004af800 0xc0004af840]
E0320 13:09:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:33.409790 543705 memory.go:184] no items to output this cycle
I0320 13:09:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 13:09:38.484166 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:09:38.484173 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:09:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:43.410571 543705 memory.go:191] Add success.
I0320 13:09:43.409817 543705 cpu.go:282] Add success.
I0320 13:09:43.420287 543705 net.go:648] Add success.
I0320 13:09:43.422853 543705 net.go:770] primary dev: ETH0
I0320 13:09:43.422867 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:09:43.422879 543705 net.go:698] Add success.
I0320 13:09:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:09:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:09:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:09:53.409906 543705 cpu.go:275] no items to output this cycle
E0320 13:09:53.409921 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:09:53.409973 543705 memory.go:184] no items to output this cycle
E0320 13:10:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:03.409782 543705 memory.go:184] no items to output this cycle
I0320 13:10:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 13:10:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:13.409779 543705 memory.go:191] Add success.
I0320 13:10:13.409802 543705 cpu.go:282] Add success.
W0320 13:10:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:10:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:10:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:10:13.420122 543705 net.go:648] Add success.
I0320 13:10:13.422930 543705 net.go:770] primary dev: ETH0
I0320 13:10:13.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:10:13.422955 543705 net.go:698] Add success.
I0320 13:10:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:10:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:10:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 13:10:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:10:14.456561 543705 disk_worker.go:494] system disk:vda1
I0320 13:10:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:10:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:10:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:10:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:10:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:10:16.472372 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:10:23.410384 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:23.410400 543705 cpu.go:275] no items to output this cycle
I0320 13:10:23.410404 543705 memory.go:184] no items to output this cycle
I0320 13:10:24.460994 543705 disk_info.go:125] begin check local disk info of client
I0320 13:10:24.463473 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:10:24.463480 543705 disk_info.go:196] parse disk info done, disk is : [0xc000347740 0xc000347780]
E0320 13:10:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:33.409778 543705 memory.go:184] no items to output this cycle
I0320 13:10:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:10:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:43.409816 543705 memory.go:191] Add success.
I0320 13:10:43.409817 543705 cpu.go:282] Add success.
I0320 13:10:43.419895 543705 net.go:648] Add success.
I0320 13:10:43.423006 543705 net.go:770] primary dev: ETH0
I0320 13:10:43.423020 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:10:43.423032 543705 net.go:698] Add success.
I0320 13:10:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:10:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:10:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:10:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:10:53.409786 543705 cpu.go:275] no items to output this cycle
I0320 13:10:53.409795 543705 memory.go:184] no items to output this cycle
E0320 13:11:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:03.409768 543705 memory.go:184] no items to output this cycle
I0320 13:11:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 13:11:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:13.409811 543705 memory.go:191] Add success.
I0320 13:11:13.409825 543705 cpu.go:282] Add success.
W0320 13:11:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:11:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:11:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:11:13.420147 543705 net.go:648] Add success.
I0320 13:11:13.422850 543705 net.go:770] primary dev: ETH0
I0320 13:11:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:11:13.422874 543705 net.go:698] Add success.
I0320 13:11:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:11:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:11:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 13:11:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:11:14.456584 543705 disk_worker.go:494] system disk:vda1
I0320 13:11:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:11:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:11:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:11:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:11:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:11:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:11:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 13:11:23.409796 543705 memory.go:184] no items to output this cycle
I0320 13:11:24.464015 543705 disk_info.go:125] begin check local disk info of client
I0320 13:11:24.466494 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:11:24.466500 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb080 0xc0001fb0c0]
E0320 13:11:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:33.409785 543705 memory.go:184] no items to output this cycle
I0320 13:11:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 13:11:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:43.409840 543705 memory.go:191] Add success.
I0320 13:11:43.409842 543705 cpu.go:282] Add success.
I0320 13:11:43.419995 543705 net.go:648] Add success.
I0320 13:11:43.423080 543705 net.go:770] primary dev: ETH0
I0320 13:11:43.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:11:43.423109 543705 net.go:698] Add success.
I0320 13:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:11:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:11:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:11:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:11:53.409791 543705 memory.go:184] no items to output this cycle
I0320 13:11:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 13:12:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:03.409806 543705 memory.go:184] no items to output this cycle
I0320 13:12:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 13:12:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:13.409792 543705 memory.go:191] Add success.
I0320 13:12:13.409811 543705 cpu.go:282] Add success.
W0320 13:12:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:12:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:12:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:12:13.420054 543705 net.go:648] Add success.
I0320 13:12:13.422624 543705 net.go:770] primary dev: ETH0
I0320 13:12:13.422638 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:12:13.422652 543705 net.go:698] Add success.
I0320 13:12:13.464598 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"45484058-aea0-431d-b8f2-62f8b8b0be91","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:12:13.464632 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 13:12:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:12:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 13:12:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:12:14.455899 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:12:14.455908 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:12:14.455913 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:12:14.456555 543705 disk_worker.go:494] system disk:vda1
I0320 13:12:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:12:15.456886 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:12:15.456894 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:12:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:12:16.457918 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:12:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:12:16.457993 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:12:16.472312 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:12:23.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:23.409890 543705 memory.go:184] no items to output this cycle
I0320 13:12:23.409916 543705 cpu.go:275] no items to output this cycle
I0320 13:12:24.467028 543705 disk_info.go:125] begin check local disk info of client
I0320 13:12:24.469543 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:12:24.469549 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aae00 0xc0001aae40]
E0320 13:12:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:33.409802 543705 memory.go:184] no items to output this cycle
I0320 13:12:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 13:12:38.485178 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:12:38.485185 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:12:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:43.410994 543705 memory.go:191] Add success.
I0320 13:12:43.409828 543705 cpu.go:282] Add success.
I0320 13:12:43.419946 543705 net.go:648] Add success.
I0320 13:12:43.422553 543705 net.go:770] primary dev: ETH0
I0320 13:12:43.422567 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:12:43.422579 543705 net.go:698] Add success.
I0320 13:12:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:12:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:12:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:12:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:12:53.409772 543705 memory.go:184] no items to output this cycle
I0320 13:12:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 13:13:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:03.409807 543705 memory.go:184] no items to output this cycle
I0320 13:13:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 13:13:13.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:13.409775 543705 memory.go:191] Add success.
W0320 13:13:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:13:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:13:13.409824 543705 cpu.go:282] Add success.
I0320 13:13:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:13:13.420067 543705 net.go:648] Add success.
I0320 13:13:13.423067 543705 net.go:770] primary dev: ETH0
I0320 13:13:13.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:13:13.423093 543705 net.go:698] Add success.
I0320 13:13:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:13:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:13:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 13:13:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:13:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 13:13:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:13:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:13:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:13:16.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:13:16.458093 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:13:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:13:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:23.409773 543705 memory.go:184] no items to output this cycle
I0320 13:13:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 13:13:24.470046 543705 disk_info.go:125] begin check local disk info of client
I0320 13:13:24.472532 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:13:24.472537 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039ea00 0xc00039ea40]
I0320 13:13:33.409906 543705 cpu.go:275] no items to output this cycle
E0320 13:13:33.410045 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:33.410059 543705 memory.go:184] no items to output this cycle
E0320 13:13:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:43.409826 543705 memory.go:191] Add success.
I0320 13:13:43.409886 543705 cpu.go:282] Add success.
I0320 13:13:43.420229 543705 net.go:648] Add success.
I0320 13:13:43.423086 543705 net.go:770] primary dev: ETH0
I0320 13:13:43.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:13:43.423114 543705 net.go:698] Add success.
I0320 13:13:46.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:13:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:13:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:13:53.410247 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:13:53.410264 543705 memory.go:184] no items to output this cycle
I0320 13:13:53.410269 543705 cpu.go:275] no items to output this cycle
E0320 13:14:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:03.409769 543705 memory.go:184] no items to output this cycle
I0320 13:14:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 13:14:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:13.409791 543705 memory.go:191] Add success.
I0320 13:14:13.409793 543705 cpu.go:282] Add success.
W0320 13:14:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:14:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:14:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:14:13.420056 543705 net.go:648] Add success.
I0320 13:14:13.422738 543705 net.go:770] primary dev: ETH0
I0320 13:14:13.422751 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:14:13.422764 543705 net.go:698] Add success.
I0320 13:14:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:14:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:14:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 13:14:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:14:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 13:14:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:14:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:14:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:14:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:14:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:14:16.472447 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:14:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:23.409777 543705 memory.go:184] no items to output this cycle
I0320 13:14:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 13:14:24.473101 543705 disk_info.go:125] begin check local disk info of client
I0320 13:14:24.475676 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:14:24.475683 543705 disk_info.go:196] parse disk info done, disk is : [0xc000320000 0xc000320040]
E0320 13:14:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:33.409789 543705 memory.go:184] no items to output this cycle
I0320 13:14:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:14:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:43.409815 543705 memory.go:191] Add success.
I0320 13:14:43.409819 543705 cpu.go:282] Add success.
I0320 13:14:43.419986 543705 net.go:648] Add success.
I0320 13:14:43.422683 543705 net.go:770] primary dev: ETH0
I0320 13:14:43.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:14:43.422709 543705 net.go:698] Add success.
I0320 13:14:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:14:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:14:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:14:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:14:53.409800 543705 memory.go:184] no items to output this cycle
I0320 13:14:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:15:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:03.409770 543705 memory.go:184] no items to output this cycle
I0320 13:15:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 13:15:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:13.409806 543705 memory.go:191] Add success.
I0320 13:15:13.409813 543705 cpu.go:282] Add success.
W0320 13:15:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:15:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:15:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:15:13.420133 543705 net.go:648] Add success.
I0320 13:15:13.422989 543705 net.go:770] primary dev: ETH0
I0320 13:15:13.423002 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:15:13.423017 543705 net.go:698] Add success.
I0320 13:15:13.469522 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"64bbc47e-ec84-4abf-831e-12fe511cfbf8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:15:13.469555 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:15:14.454986 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:15:14.455232 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:15:14.455243 543705 disk_worker.go:708] disk space is not compliant
W0320 13:15:14.455246 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:15:14.456948 543705 disk_worker.go:494] system disk:vda1
I0320 13:15:14.457006 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:15:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:15:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:15:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:15:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:15:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:15:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:23.409773 543705 memory.go:184] no items to output this cycle
I0320 13:15:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 13:15:24.475761 543705 disk_info.go:125] begin check local disk info of client
I0320 13:15:24.478289 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:15:24.478294 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002831c0 0xc000283200]
E0320 13:15:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:33.409777 543705 memory.go:184] no items to output this cycle
I0320 13:15:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 13:15:38.486187 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:15:38.486194 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:15:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:43.409822 543705 memory.go:191] Add success.
I0320 13:15:43.409825 543705 cpu.go:282] Add success.
I0320 13:15:43.420224 543705 net.go:648] Add success.
I0320 13:15:43.421285 543705 net.go:770] primary dev: ETH0
I0320 13:15:43.421301 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:15:43.421314 543705 net.go:698] Add success.
I0320 13:15:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:15:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:15:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:15:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:15:53.409786 543705 cpu.go:275] no items to output this cycle
I0320 13:15:53.409794 543705 memory.go:184] no items to output this cycle
E0320 13:16:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:03.409780 543705 memory.go:184] no items to output this cycle
I0320 13:16:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 13:16:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:13.409782 543705 memory.go:191] Add success.
W0320 13:16:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:16:13.409809 543705 cpu.go:282] Add success.
W0320 13:16:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:16:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:16:13.420142 543705 net.go:648] Add success.
I0320 13:16:13.422940 543705 net.go:770] primary dev: ETH0
I0320 13:16:13.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:16:13.422966 543705 net.go:698] Add success.
I0320 13:16:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:16:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:16:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 13:16:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:16:14.456829 543705 disk_worker.go:494] system disk:vda1
I0320 13:16:14.456877 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:16:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:16:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:16:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:16:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:16:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:16:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:16:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 13:16:24.479083 543705 disk_info.go:125] begin check local disk info of client
I0320 13:16:24.481489 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:16:24.481496 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab280 0xc0001ab2c0]
E0320 13:16:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:33.409807 543705 memory.go:184] no items to output this cycle
I0320 13:16:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 13:16:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:43.409787 543705 memory.go:191] Add success.
I0320 13:16:43.409799 543705 cpu.go:282] Add success.
I0320 13:16:43.419905 543705 net.go:648] Add success.
I0320 13:16:43.422878 543705 net.go:770] primary dev: ETH0
I0320 13:16:43.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:16:43.422907 543705 net.go:698] Add success.
I0320 13:16:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:16:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:16:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:16:53.410332 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:16:53.410348 543705 memory.go:184] no items to output this cycle
I0320 13:16:53.410354 543705 cpu.go:275] no items to output this cycle
E0320 13:17:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:03.409783 543705 memory.go:184] no items to output this cycle
I0320 13:17:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 13:17:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:13.409796 543705 memory.go:191] Add success.
I0320 13:17:13.409797 543705 cpu.go:282] Add success.
W0320 13:17:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:17:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:17:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:17:13.420490 543705 net.go:648] Add success.
I0320 13:17:13.423680 543705 net.go:770] primary dev: ETH0
I0320 13:17:13.423694 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:17:13.423705 543705 net.go:698] Add success.
I0320 13:17:13.452772 543705 event_worker.go:152] Polling the log file for events...
W0320 13:17:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:17:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 13:17:14.455200 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:17:14.455945 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:17:14.455954 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:17:14.455960 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:17:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 13:17:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:17:15.456849 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:17:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:17:16.457956 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:17:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:17:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:17:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:17:16.472357 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:17:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:17:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 13:17:24.482092 543705 disk_info.go:125] begin check local disk info of client
I0320 13:17:24.484505 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:17:24.484511 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004744c0 0xc000474500]
E0320 13:17:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:33.409818 543705 memory.go:184] no items to output this cycle
I0320 13:17:33.409833 543705 cpu.go:275] no items to output this cycle
E0320 13:17:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:43.409804 543705 memory.go:191] Add success.
I0320 13:17:43.409866 543705 cpu.go:282] Add success.
I0320 13:17:43.420264 543705 net.go:648] Add success.
I0320 13:17:43.423301 543705 net.go:770] primary dev: ETH0
I0320 13:17:43.423322 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:17:43.423341 543705 net.go:698] Add success.
I0320 13:17:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:17:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:17:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:17:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:17:53.409769 543705 memory.go:184] no items to output this cycle
I0320 13:17:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:18:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:03.409784 543705 memory.go:184] no items to output this cycle
I0320 13:18:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 13:18:13.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:13.409929 543705 memory.go:191] Add success.
W0320 13:18:13.409979 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:18:13.409979 543705 cpu.go:282] Add success.
W0320 13:18:13.409997 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:18:13.410002 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:18:13.419734 543705 net.go:648] Add success.
I0320 13:18:13.422404 543705 net.go:770] primary dev: ETH0
I0320 13:18:13.422417 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:18:13.422429 543705 net.go:698] Add success.
I0320 13:18:13.469172 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"aa0669ec-6d41-4fa9-8f40-6fb256539820","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:18:13.469212 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:18:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:18:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:18:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 13:18:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:18:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 13:18:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:18:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:18:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:18:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:18:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:18:16.472413 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:18:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:23.409785 543705 memory.go:184] no items to output this cycle
I0320 13:18:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 13:18:24.485112 543705 disk_info.go:125] begin check local disk info of client
I0320 13:18:24.487540 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:18:24.487545 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ac3c0 0xc0002ac440]
E0320 13:18:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:33.409794 543705 memory.go:184] no items to output this cycle
I0320 13:18:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 13:18:38.487187 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:18:38.487194 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:18:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:43.410757 543705 memory.go:191] Add success.
I0320 13:18:43.409846 543705 cpu.go:282] Add success.
I0320 13:18:43.420322 543705 net.go:770] primary dev: ETH0
I0320 13:18:43.420336 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:18:43.420348 543705 net.go:698] Add success.
I0320 13:18:43.420588 543705 net.go:648] Add success.
I0320 13:18:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:18:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:18:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:18:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:18:53.409781 543705 memory.go:184] no items to output this cycle
I0320 13:18:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 13:19:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:03.409814 543705 memory.go:184] no items to output this cycle
I0320 13:19:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 13:19:13.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:13.409893 543705 memory.go:191] Add success.
W0320 13:19:13.409922 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:19:13.409934 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:19:13.409937 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:19:13.409953 543705 cpu.go:282] Add success.
I0320 13:19:13.419756 543705 net.go:648] Add success.
I0320 13:19:13.422483 543705 net.go:770] primary dev: ETH0
I0320 13:19:13.422498 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:19:13.422511 543705 net.go:698] Add success.
I0320 13:19:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:19:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:19:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 13:19:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:19:14.456557 543705 disk_worker.go:494] system disk:vda1
I0320 13:19:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:19:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:19:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:19:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:19:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:19:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:19:23.410675 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:23.410691 543705 memory.go:184] no items to output this cycle
I0320 13:19:23.410720 543705 cpu.go:275] no items to output this cycle
I0320 13:19:24.488135 543705 disk_info.go:125] begin check local disk info of client
I0320 13:19:24.490561 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:19:24.490566 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048db80 0xc00048dbc0]
E0320 13:19:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:33.409816 543705 memory.go:184] no items to output this cycle
I0320 13:19:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 13:19:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:43.409780 543705 memory.go:191] Add success.
I0320 13:19:43.409808 543705 cpu.go:282] Add success.
I0320 13:19:43.419901 543705 net.go:648] Add success.
I0320 13:19:43.422833 543705 net.go:770] primary dev: ETH0
I0320 13:19:43.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:19:43.422861 543705 net.go:698] Add success.
I0320 13:19:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:19:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:19:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:19:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:19:53.409773 543705 memory.go:184] no items to output this cycle
I0320 13:19:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 13:20:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:03.409796 543705 memory.go:184] no items to output this cycle
I0320 13:20:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:20:13.409897 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:13.409941 543705 memory.go:191] Add success.
I0320 13:20:13.409967 543705 cpu.go:282] Add success.
W0320 13:20:13.409980 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:20:13.410003 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:20:13.410008 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:20:13.419714 543705 net.go:648] Add success.
I0320 13:20:13.422400 543705 net.go:770] primary dev: ETH0
I0320 13:20:13.422413 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:20:13.422424 543705 net.go:698] Add success.
I0320 13:20:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:20:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:20:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 13:20:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:20:14.456561 543705 disk_worker.go:494] system disk:vda1
I0320 13:20:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:20:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:20:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:20:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:20:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:20:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:23.409800 543705 memory.go:184] no items to output this cycle
I0320 13:20:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 13:20:24.491144 543705 disk_info.go:125] begin check local disk info of client
I0320 13:20:24.493565 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:20:24.493570 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048cc40 0xc00048cc80]
E0320 13:20:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:33.409807 543705 memory.go:184] no items to output this cycle
I0320 13:20:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 13:20:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:43.409778 543705 memory.go:191] Add success.
I0320 13:20:43.409806 543705 cpu.go:282] Add success.
I0320 13:20:43.419900 543705 net.go:648] Add success.
I0320 13:20:43.422615 543705 net.go:770] primary dev: ETH0
I0320 13:20:43.422630 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:20:43.422644 543705 net.go:698] Add success.
I0320 13:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:20:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:20:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:20:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:20:53.409806 543705 memory.go:184] no items to output this cycle
I0320 13:20:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 13:21:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:03.409770 543705 memory.go:184] no items to output this cycle
I0320 13:21:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:21:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:13.409788 543705 memory.go:191] Add success.
I0320 13:21:13.409803 543705 cpu.go:282] Add success.
W0320 13:21:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:21:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:21:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:21:13.419736 543705 net.go:648] Add success.
I0320 13:21:13.422666 543705 net.go:770] primary dev: ETH0
I0320 13:21:13.422678 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:21:13.422690 543705 net.go:698] Add success.
I0320 13:21:13.469123 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"90c2c9db-c422-4d79-bce5-94fecd47114f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:21:13.469155 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:21:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:21:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:21:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 13:21:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:21:14.456539 543705 disk_worker.go:494] system disk:vda1
I0320 13:21:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:21:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:21:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:21:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:21:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:21:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:21:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:23.409800 543705 memory.go:184] no items to output this cycle
I0320 13:21:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 13:21:24.494162 543705 disk_info.go:125] begin check local disk info of client
I0320 13:21:24.496582 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:21:24.496587 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035ca80 0xc00035cac0]
E0320 13:21:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:33.409783 543705 memory.go:184] no items to output this cycle
I0320 13:21:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 13:21:38.488198 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:21:38.488205 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:21:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:43.410557 543705 memory.go:191] Add success.
I0320 13:21:43.409808 543705 cpu.go:282] Add success.
I0320 13:21:43.420298 543705 net.go:648] Add success.
I0320 13:21:43.422982 543705 net.go:770] primary dev: ETH0
I0320 13:21:43.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:21:43.423008 543705 net.go:698] Add success.
I0320 13:21:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:21:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:21:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:21:53.410214 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:21:53.410230 543705 memory.go:184] no items to output this cycle
I0320 13:21:53.410237 543705 cpu.go:275] no items to output this cycle
E0320 13:22:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:03.409770 543705 memory.go:184] no items to output this cycle
I0320 13:22:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 13:22:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:13.409791 543705 memory.go:191] Add success.
I0320 13:22:13.409812 543705 cpu.go:282] Add success.
W0320 13:22:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:22:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:22:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:22:13.419708 543705 net.go:648] Add success.
I0320 13:22:13.422398 543705 net.go:770] primary dev: ETH0
I0320 13:22:13.422411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:22:13.422422 543705 net.go:698] Add success.
W0320 13:22:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:22:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 13:22:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:22:14.455855 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:22:14.455864 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:22:14.455870 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:22:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 13:22:14.456653 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:22:15.456855 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:22:15.456864 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:22:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:22:16.457935 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:22:16.457989 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:22:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:22:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:22:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:22:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 13:22:24.496856 543705 disk_info.go:125] begin check local disk info of client
I0320 13:22:24.499352 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:22:24.499357 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c47c0 0xc0000c4800]
E0320 13:22:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:33.409799 543705 memory.go:184] no items to output this cycle
I0320 13:22:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 13:22:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:43.409823 543705 memory.go:191] Add success.
I0320 13:22:43.409827 543705 cpu.go:282] Add success.
I0320 13:22:43.419979 543705 net.go:648] Add success.
I0320 13:22:43.422532 543705 net.go:770] primary dev: ETH0
I0320 13:22:43.422545 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:22:43.422557 543705 net.go:698] Add success.
I0320 13:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:22:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:22:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:22:53.410411 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:22:53.410428 543705 memory.go:184] no items to output this cycle
I0320 13:22:53.410450 543705 cpu.go:275] no items to output this cycle
E0320 13:23:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:03.409773 543705 memory.go:184] no items to output this cycle
I0320 13:23:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:23:13.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:13.409908 543705 memory.go:191] Add success.
W0320 13:23:13.409935 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:23:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:23:13.409955 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:23:13.409990 543705 cpu.go:282] Add success.
I0320 13:23:13.419717 543705 net.go:648] Add success.
I0320 13:23:13.422640 543705 net.go:770] primary dev: ETH0
I0320 13:23:13.422655 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:23:13.422668 543705 net.go:698] Add success.
I0320 13:23:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:23:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:23:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 13:23:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:23:14.456502 543705 disk_worker.go:494] system disk:vda1
I0320 13:23:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:23:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:23:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:23:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:23:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:23:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:23:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:23.409784 543705 memory.go:184] no items to output this cycle
I0320 13:23:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:23:24.500187 543705 disk_info.go:125] begin check local disk info of client
I0320 13:23:24.502627 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:23:24.502633 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475240 0xc000475280]
E0320 13:23:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:33.409806 543705 memory.go:184] no items to output this cycle
I0320 13:23:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 13:23:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:43.409825 543705 memory.go:191] Add success.
I0320 13:23:43.409830 543705 cpu.go:282] Add success.
I0320 13:23:43.419978 543705 net.go:648] Add success.
I0320 13:23:43.422935 543705 net.go:770] primary dev: ETH0
I0320 13:23:43.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:23:43.422962 543705 net.go:698] Add success.
I0320 13:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:23:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:23:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:23:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:23:53.409783 543705 memory.go:184] no items to output this cycle
I0320 13:23:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 13:24:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:03.409773 543705 memory.go:184] no items to output this cycle
I0320 13:24:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 13:24:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:13.409900 543705 memory.go:191] Add success.
I0320 13:24:13.409901 543705 cpu.go:282] Add success.
W0320 13:24:13.409931 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:24:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:24:13.409957 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:24:13.419715 543705 net.go:648] Add success.
I0320 13:24:13.422426 543705 net.go:770] primary dev: ETH0
I0320 13:24:13.422438 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:24:13.422450 543705 net.go:698] Add success.
I0320 13:24:13.543883 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"173252a4-ebcb-46c3-bd5b-6545a3170a3b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:24:13.543914 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:24:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:24:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:24:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 13:24:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:24:14.456662 543705 disk_worker.go:494] system disk:vda1
I0320 13:24:14.456691 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:24:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:24:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:24:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:24:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:24:23.410689 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:23.410705 543705 memory.go:184] no items to output this cycle
I0320 13:24:23.410734 543705 cpu.go:275] no items to output this cycle
I0320 13:24:24.503189 543705 disk_info.go:125] begin check local disk info of client
I0320 13:24:24.505688 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:24:24.505694 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039f900 0xc00039f940]
E0320 13:24:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:33.409812 543705 memory.go:184] no items to output this cycle
I0320 13:24:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 13:24:38.489199 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:24:38.489206 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:24:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:43.410739 543705 memory.go:191] Add success.
I0320 13:24:43.409808 543705 cpu.go:282] Add success.
I0320 13:24:43.420447 543705 net.go:648] Add success.
I0320 13:24:43.423458 543705 net.go:770] primary dev: ETH0
I0320 13:24:43.423471 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:24:43.423484 543705 net.go:698] Add success.
I0320 13:24:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:24:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:24:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:24:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:24:53.409774 543705 memory.go:184] no items to output this cycle
I0320 13:24:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 13:25:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:03.409804 543705 memory.go:184] no items to output this cycle
I0320 13:25:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 13:25:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:13.409784 543705 memory.go:191] Add success.
I0320 13:25:13.409807 543705 cpu.go:282] Add success.
W0320 13:25:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:25:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:25:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:25:13.419749 543705 net.go:648] Add success.
I0320 13:25:13.422706 543705 net.go:770] primary dev: ETH0
I0320 13:25:13.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:25:13.422730 543705 net.go:698] Add success.
I0320 13:25:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:25:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:25:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 13:25:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:25:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 13:25:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:25:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:25:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:25:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:25:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:25:16.472460 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:25:23.410643 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:23.410658 543705 memory.go:184] no items to output this cycle
I0320 13:25:23.410687 543705 cpu.go:275] no items to output this cycle
I0320 13:25:24.506173 543705 disk_info.go:125] begin check local disk info of client
I0320 13:25:24.508593 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:25:24.508599 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be80 0xc00007bec0]
E0320 13:25:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:33.409785 543705 memory.go:184] no items to output this cycle
I0320 13:25:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:25:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:43.409813 543705 memory.go:191] Add success.
I0320 13:25:43.409820 543705 cpu.go:282] Add success.
I0320 13:25:43.420026 543705 net.go:648] Add success.
I0320 13:25:43.422815 543705 net.go:770] primary dev: ETH0
I0320 13:25:43.422829 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:25:43.422840 543705 net.go:698] Add success.
I0320 13:25:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:25:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:25:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:25:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:25:53.409782 543705 memory.go:184] no items to output this cycle
I0320 13:25:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 13:26:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:03.409779 543705 memory.go:184] no items to output this cycle
I0320 13:26:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 13:26:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:13.409823 543705 memory.go:191] Add success.
I0320 13:26:13.409832 543705 cpu.go:282] Add success.
W0320 13:26:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:26:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:26:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:26:13.420182 543705 net.go:648] Add success.
I0320 13:26:13.422972 543705 net.go:770] primary dev: ETH0
I0320 13:26:13.422985 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:26:13.422997 543705 net.go:698] Add success.
I0320 13:26:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:26:14.455311 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:26:14.455322 543705 disk_worker.go:708] disk space is not compliant
W0320 13:26:14.455329 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:26:14.457486 543705 disk_worker.go:494] system disk:vda1
I0320 13:26:14.457529 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:26:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:26:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:26:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:26:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:26:16.472455 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:26:23.410256 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:23.410266 543705 cpu.go:275] no items to output this cycle
I0320 13:26:23.410273 543705 memory.go:184] no items to output this cycle
I0320 13:26:24.509241 543705 disk_info.go:125] begin check local disk info of client
I0320 13:26:24.511752 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:26:24.511758 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046a000 0xc00046a040]
E0320 13:26:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:33.409792 543705 memory.go:184] no items to output this cycle
I0320 13:26:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 13:26:43.410426 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:43.410453 543705 memory.go:191] Add success.
I0320 13:26:43.410456 543705 cpu.go:282] Add success.
I0320 13:26:43.420606 543705 net.go:648] Add success.
I0320 13:26:43.423516 543705 net.go:770] primary dev: ETH0
I0320 13:26:43.423530 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:26:43.423543 543705 net.go:698] Add success.
I0320 13:26:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:26:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:26:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:26:53.409784 543705 cpu.go:275] no items to output this cycle
I0320 13:26:53.409788 543705 memory.go:184] no items to output this cycle
E0320 13:27:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:03.409802 543705 memory.go:184] no items to output this cycle
I0320 13:27:03.409810 543705 cpu.go:275] no items to output this cycle
W0320 13:27:13.409704 543705 conf_downlod.go:84] cant't download conf: Get "": unsupported protocol scheme ""
W0320 13:27:13.409713 543705 conf_downlod.go:89] use old conf
E0320 13:27:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:13.409808 543705 memory.go:191] Add success.
I0320 13:27:13.409820 543705 cpu.go:282] Add success.
W0320 13:27:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:27:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:27:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:27:13.420252 543705 net.go:648] Add success.
I0320 13:27:13.429416 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 13:27:13.429517 543705 net.go:770] primary dev: ETH0
I0320 13:27:13.429530 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:27:13.429540 543705 net.go:698] Add success.
I0320 13:27:13.453149 543705 event_worker.go:152] Polling the log file for events...
I0320 13:27:13.468570 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ef0eca33-ec42-420c-864c-7335e4f8102a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:27:13.468607 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 13:27:14.455358 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:27:14.455372 543705 disk_worker.go:708] disk space is not compliant
W0320 13:27:14.455377 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:27:14.456228 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:27:14.456991 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:27:14.456998 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:27:14.458001 543705 disk_worker.go:494] system disk:vda1
I0320 13:27:14.458033 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:27:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:27:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:27:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:27:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:27:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:27:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:27:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:27:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:23.409773 543705 cpu.go:275] no items to output this cycle
I0320 13:27:23.409785 543705 memory.go:184] no items to output this cycle
I0320 13:27:24.511836 543705 disk_info.go:125] begin check local disk info of client
I0320 13:27:24.514300 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:27:24.514305 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa640 0xc0001fa680]
E0320 13:27:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:33.409806 543705 memory.go:184] no items to output this cycle
I0320 13:27:33.409816 543705 cpu.go:275] no items to output this cycle
I0320 13:27:38.490206 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:27:38.490213 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:27:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:43.410742 543705 memory.go:191] Add success.
I0320 13:27:43.409791 543705 cpu.go:282] Add success.
I0320 13:27:43.420436 543705 net.go:648] Add success.
I0320 13:27:43.423440 543705 net.go:770] primary dev: ETH0
I0320 13:27:43.423453 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:27:43.423465 543705 net.go:698] Add success.
I0320 13:27:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:27:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:27:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:27:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:27:53.409779 543705 memory.go:184] no items to output this cycle
I0320 13:27:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 13:28:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:03.409780 543705 memory.go:184] no items to output this cycle
I0320 13:28:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:28:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:13.409811 543705 memory.go:191] Add success.
I0320 13:28:13.409820 543705 cpu.go:282] Add success.
W0320 13:28:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:28:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:28:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:28:13.420075 543705 net.go:648] Add success.
I0320 13:28:13.423210 543705 net.go:770] primary dev: ETH0
I0320 13:28:13.423227 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:28:13.423240 543705 net.go:698] Add success.
I0320 13:28:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:28:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:28:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 13:28:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:28:14.459278 543705 disk_worker.go:494] system disk:vda1
I0320 13:28:14.459307 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:28:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:28:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:28:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:28:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:28:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:28:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:23.409786 543705 memory.go:184] no items to output this cycle
I0320 13:28:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:28:24.515267 543705 disk_info.go:125] begin check local disk info of client
I0320 13:28:24.517778 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:28:24.517784 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5680 0xc0000c56c0]
E0320 13:28:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:33.409779 543705 memory.go:184] no items to output this cycle
I0320 13:28:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 13:28:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:43.409798 543705 memory.go:191] Add success.
I0320 13:28:43.409821 543705 cpu.go:282] Add success.
I0320 13:28:43.419910 543705 net.go:648] Add success.
I0320 13:28:43.422480 543705 net.go:770] primary dev: ETH0
I0320 13:28:43.422496 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:28:43.422511 543705 net.go:698] Add success.
I0320 13:28:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:28:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:28:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:28:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:28:53.409792 543705 memory.go:184] no items to output this cycle
I0320 13:28:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 13:29:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:03.409781 543705 memory.go:184] no items to output this cycle
I0320 13:29:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 13:29:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:13.409781 543705 memory.go:191] Add success.
W0320 13:29:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:29:13.409806 543705 cpu.go:282] Add success.
W0320 13:29:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:29:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:29:13.420158 543705 net.go:648] Add success.
I0320 13:29:13.422780 543705 net.go:770] primary dev: ETH0
I0320 13:29:13.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:29:13.422805 543705 net.go:698] Add success.
I0320 13:29:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:29:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:29:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 13:29:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:29:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 13:29:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:29:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:29:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:29:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:29:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:29:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:29:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:29:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 13:29:24.517864 543705 disk_info.go:125] begin check local disk info of client
I0320 13:29:24.520379 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:29:24.520386 543705 disk_info.go:196] parse disk info done, disk is : [0xc000257440 0xc000257480]
E0320 13:29:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:33.409778 543705 memory.go:184] no items to output this cycle
I0320 13:29:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 13:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:43.409805 543705 memory.go:191] Add success.
I0320 13:29:43.409819 543705 cpu.go:282] Add success.
I0320 13:29:43.419966 543705 net.go:648] Add success.
I0320 13:29:43.422632 543705 net.go:770] primary dev: ETH0
I0320 13:29:43.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:29:43.422674 543705 net.go:698] Add success.
I0320 13:29:46.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:29:46.458077 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:29:46.458107 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:29:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:29:53.409818 543705 memory.go:184] no items to output this cycle
I0320 13:29:53.409830 543705 cpu.go:275] no items to output this cycle
E0320 13:30:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:03.409796 543705 memory.go:184] no items to output this cycle
I0320 13:30:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:30:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:13.409794 543705 memory.go:191] Add success.
W0320 13:30:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:30:13.409826 543705 cpu.go:282] Add success.
W0320 13:30:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:30:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:30:13.420192 543705 net.go:648] Add success.
I0320 13:30:13.423195 543705 net.go:770] primary dev: ETH0
I0320 13:30:13.423217 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:30:13.423231 543705 net.go:698] Add success.
I0320 13:30:13.467841 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3fe4d411-a19f-4ac8-8bb5-41c7a09a8829","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:30:13.467874 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:30:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:30:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:30:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 13:30:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:30:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 13:30:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:30:15.456016 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:30:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:30:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:30:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:30:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:30:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:23.409777 543705 memory.go:184] no items to output this cycle
I0320 13:30:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 13:30:24.521300 543705 disk_info.go:125] begin check local disk info of client
I0320 13:30:24.523769 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:30:24.523774 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4100 0xc0000c4140]
E0320 13:30:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:33.409797 543705 memory.go:184] no items to output this cycle
I0320 13:30:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 13:30:38.491209 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:30:38.491216 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:30:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:43.410654 543705 memory.go:191] Add success.
I0320 13:30:43.409806 543705 cpu.go:282] Add success.
I0320 13:30:43.420389 543705 net.go:648] Add success.
I0320 13:30:43.423270 543705 net.go:770] primary dev: ETH0
I0320 13:30:43.423284 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:30:43.423296 543705 net.go:698] Add success.
I0320 13:30:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:30:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:30:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:30:53.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:30:53.409823 543705 memory.go:184] no items to output this cycle
I0320 13:30:53.409835 543705 cpu.go:275] no items to output this cycle
E0320 13:31:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:03.409787 543705 memory.go:184] no items to output this cycle
I0320 13:31:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:31:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:13.409822 543705 memory.go:191] Add success.
I0320 13:31:13.409832 543705 cpu.go:282] Add success.
W0320 13:31:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:31:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:31:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:31:13.420071 543705 net.go:648] Add success.
I0320 13:31:13.422906 543705 net.go:770] primary dev: ETH0
I0320 13:31:13.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:31:13.422931 543705 net.go:698] Add success.
I0320 13:31:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:31:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:31:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 13:31:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:31:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 13:31:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:31:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:31:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:31:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:31:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:31:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:31:23.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:23.409762 543705 memory.go:184] no items to output this cycle
I0320 13:31:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 13:31:24.524256 543705 disk_info.go:125] begin check local disk info of client
I0320 13:31:24.526692 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:31:24.526698 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002abc40 0xc0002abc80]
E0320 13:31:33.409909 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:33.409929 543705 memory.go:184] no items to output this cycle
I0320 13:31:33.410060 543705 cpu.go:275] no items to output this cycle
E0320 13:31:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:43.409798 543705 memory.go:191] Add success.
I0320 13:31:43.409808 543705 cpu.go:282] Add success.
I0320 13:31:43.420053 543705 net.go:648] Add success.
I0320 13:31:43.423171 543705 net.go:770] primary dev: ETH0
I0320 13:31:43.423184 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:31:43.423198 543705 net.go:698] Add success.
I0320 13:31:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:31:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:31:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:31:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:31:53.409801 543705 memory.go:184] no items to output this cycle
I0320 13:31:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:32:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:03.409778 543705 memory.go:184] no items to output this cycle
I0320 13:32:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 13:32:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:13.409787 543705 memory.go:191] Add success.
I0320 13:32:13.409793 543705 cpu.go:282] Add success.
W0320 13:32:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:32:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:32:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:32:13.420245 543705 net.go:648] Add success.
I0320 13:32:13.422838 543705 net.go:770] primary dev: ETH0
I0320 13:32:13.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:32:13.422865 543705 net.go:698] Add success.
W0320 13:32:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:32:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 13:32:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:32:14.456942 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:32:14.456953 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:32:14.456959 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:32:14.457029 543705 disk_worker.go:494] system disk:vda1
I0320 13:32:14.457071 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:32:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:32:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:32:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:32:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:32:16.458019 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:32:16.458039 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:32:16.472415 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:32:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:23.409774 543705 memory.go:184] no items to output this cycle
I0320 13:32:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 13:32:24.527266 543705 disk_info.go:125] begin check local disk info of client
I0320 13:32:24.529747 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:32:24.529753 543705 disk_info.go:196] parse disk info done, disk is : [0xc000217300 0xc000217340]
E0320 13:32:33.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:33.409889 543705 memory.go:184] no items to output this cycle
I0320 13:32:33.409917 543705 cpu.go:275] no items to output this cycle
E0320 13:32:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:43.409786 543705 memory.go:191] Add success.
I0320 13:32:43.409797 543705 cpu.go:282] Add success.
I0320 13:32:43.420072 543705 net.go:648] Add success.
I0320 13:32:43.422817 543705 net.go:770] primary dev: ETH0
I0320 13:32:43.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:32:43.422843 543705 net.go:698] Add success.
I0320 13:32:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:32:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:32:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:32:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:32:53.409782 543705 memory.go:184] no items to output this cycle
I0320 13:32:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 13:33:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:03.409771 543705 memory.go:184] no items to output this cycle
I0320 13:33:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 13:33:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:13.409813 543705 memory.go:191] Add success.
I0320 13:33:13.409823 543705 cpu.go:282] Add success.
W0320 13:33:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:33:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:33:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:33:13.420161 543705 net.go:648] Add success.
I0320 13:33:13.423224 543705 net.go:770] primary dev: ETH0
I0320 13:33:13.423237 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:33:13.423248 543705 net.go:698] Add success.
I0320 13:33:13.463697 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9ef1b5b2-447f-41c8-b601-a0dd34fce657","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:33:13.463732 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:33:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:33:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:33:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 13:33:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:33:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 13:33:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:33:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:33:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:33:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:33:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:33:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:33:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:23.409764 543705 memory.go:184] no items to output this cycle
I0320 13:33:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 13:33:24.530294 543705 disk_info.go:125] begin check local disk info of client
I0320 13:33:24.532727 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:33:24.532732 543705 disk_info.go:196] parse disk info done, disk is : [0xc000265900 0xc000265940]
I0320 13:33:33.409931 543705 cpu.go:275] no items to output this cycle
E0320 13:33:33.409969 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:33.409985 543705 memory.go:184] no items to output this cycle
I0320 13:33:38.492221 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:33:38.492228 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:33:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:43.410558 543705 memory.go:191] Add success.
I0320 13:33:43.409813 543705 cpu.go:282] Add success.
I0320 13:33:43.420290 543705 net.go:648] Add success.
I0320 13:33:43.422776 543705 net.go:770] primary dev: ETH0
I0320 13:33:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:33:43.422803 543705 net.go:698] Add success.
I0320 13:33:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:33:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:33:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:33:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:33:53.409785 543705 memory.go:184] no items to output this cycle
I0320 13:33:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 13:34:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:03.409769 543705 memory.go:184] no items to output this cycle
I0320 13:34:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:34:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:13.409800 543705 memory.go:191] Add success.
I0320 13:34:13.409799 543705 cpu.go:282] Add success.
W0320 13:34:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:34:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:34:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:34:13.420106 543705 net.go:648] Add success.
I0320 13:34:13.422756 543705 net.go:770] primary dev: ETH0
I0320 13:34:13.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:34:13.422780 543705 net.go:698] Add success.
I0320 13:34:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:34:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:34:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 13:34:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:34:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 13:34:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:34:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:34:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:34:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:34:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:34:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:23.409766 543705 memory.go:184] no items to output this cycle
I0320 13:34:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 13:34:24.532810 543705 disk_info.go:125] begin check local disk info of client
I0320 13:34:24.535296 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:34:24.535302 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395d00 0xc000395d40]
E0320 13:34:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:33.409776 543705 memory.go:184] no items to output this cycle
I0320 13:34:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:34:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:43.409789 543705 memory.go:191] Add success.
I0320 13:34:43.409817 543705 cpu.go:282] Add success.
I0320 13:34:43.419883 543705 net.go:648] Add success.
I0320 13:34:43.422818 543705 net.go:770] primary dev: ETH0
I0320 13:34:43.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:34:43.422843 543705 net.go:698] Add success.
I0320 13:34:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:34:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:34:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:34:53.409774 543705 memory.go:184] no items to output this cycle
I0320 13:34:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:35:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:03.409804 543705 memory.go:184] no items to output this cycle
I0320 13:35:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 13:35:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:13.409785 543705 memory.go:191] Add success.
I0320 13:35:13.409805 543705 cpu.go:282] Add success.
W0320 13:35:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:35:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:35:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:35:13.420181 543705 net.go:648] Add success.
I0320 13:35:13.422759 543705 net.go:770] primary dev: ETH0
I0320 13:35:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:35:13.422788 543705 net.go:698] Add success.
I0320 13:35:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:35:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:35:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 13:35:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:35:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 13:35:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:35:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:35:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:35:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:35:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:35:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:35:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:23.409777 543705 memory.go:184] no items to output this cycle
I0320 13:35:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 13:35:24.536374 543705 disk_info.go:125] begin check local disk info of client
I0320 13:35:24.538855 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:35:24.538861 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027ae40 0xc00027ae80]
E0320 13:35:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:33.409813 543705 memory.go:184] no items to output this cycle
I0320 13:35:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 13:35:43.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:43.409907 543705 memory.go:191] Add success.
I0320 13:35:43.409958 543705 cpu.go:282] Add success.
I0320 13:35:43.419721 543705 net.go:648] Add success.
I0320 13:35:43.422361 543705 net.go:770] primary dev: ETH0
I0320 13:35:43.422376 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:35:43.422391 543705 net.go:698] Add success.
I0320 13:35:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:35:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:35:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:35:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:35:53.409810 543705 memory.go:184] no items to output this cycle
I0320 13:35:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 13:36:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:03.409774 543705 memory.go:184] no items to output this cycle
I0320 13:36:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:36:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:13.409822 543705 memory.go:191] Add success.
I0320 13:36:13.409833 543705 cpu.go:282] Add success.
W0320 13:36:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:36:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:36:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:36:13.420272 543705 net.go:648] Add success.
I0320 13:36:13.423048 543705 net.go:770] primary dev: ETH0
I0320 13:36:13.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:36:13.423072 543705 net.go:698] Add success.
I0320 13:36:13.470185 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c7f3e158-8136-44b3-868d-9bb724bf87cf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:36:13.470226 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:36:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:36:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:36:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 13:36:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:36:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 13:36:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:36:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:36:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:36:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:36:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:36:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:36:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:23.409810 543705 memory.go:184] no items to output this cycle
I0320 13:36:23.409820 543705 cpu.go:275] no items to output this cycle
I0320 13:36:24.538942 543705 disk_info.go:125] begin check local disk info of client
I0320 13:36:24.541362 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:36:24.541367 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002586c0 0xc000258700]
E0320 13:36:33.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:33.409829 543705 memory.go:184] no items to output this cycle
I0320 13:36:33.409845 543705 cpu.go:275] no items to output this cycle
I0320 13:36:38.493218 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:36:38.493225 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:36:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:43.410596 543705 memory.go:191] Add success.
I0320 13:36:43.409842 543705 cpu.go:282] Add success.
I0320 13:36:43.420518 543705 net.go:648] Add success.
I0320 13:36:43.423095 543705 net.go:770] primary dev: ETH0
I0320 13:36:43.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:36:43.423120 543705 net.go:698] Add success.
I0320 13:36:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:36:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:36:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:36:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:36:53.409819 543705 memory.go:184] no items to output this cycle
I0320 13:36:53.409830 543705 cpu.go:275] no items to output this cycle
E0320 13:37:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:03.409807 543705 memory.go:184] no items to output this cycle
I0320 13:37:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 13:37:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:13.409797 543705 memory.go:191] Add success.
I0320 13:37:13.409803 543705 cpu.go:282] Add success.
W0320 13:37:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:37:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:37:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:37:13.420037 543705 net.go:648] Add success.
I0320 13:37:13.423014 543705 net.go:770] primary dev: ETH0
I0320 13:37:13.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:37:13.423040 543705 net.go:698] Add success.
I0320 13:37:13.453663 543705 event_worker.go:152] Polling the log file for events...
W0320 13:37:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:37:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 13:37:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:37:14.456757 543705 disk_worker.go:494] system disk:vda1
I0320 13:37:14.456809 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:37:14.457108 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:37:14.457116 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:37:14.457120 543705 custom_config.go:64] query custom config with name: gpu
E0320 13:37:15.456872 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:37:15.456880 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:37:16.457913 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:37:16.457913 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:37:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:37:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:37:16.472318 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:37:23.410273 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:23.410296 543705 memory.go:184] no items to output this cycle
I0320 13:37:23.410295 543705 cpu.go:275] no items to output this cycle
I0320 13:37:24.542402 543705 disk_info.go:125] begin check local disk info of client
I0320 13:37:24.544819 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:37:24.544825 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c57c0 0xc0000c5800]
E0320 13:37:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:33.409821 543705 memory.go:184] no items to output this cycle
I0320 13:37:33.409828 543705 cpu.go:275] no items to output this cycle
E0320 13:37:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:43.409808 543705 memory.go:191] Add success.
I0320 13:37:43.409818 543705 cpu.go:282] Add success.
I0320 13:37:43.420045 543705 net.go:648] Add success.
I0320 13:37:43.422617 543705 net.go:770] primary dev: ETH0
I0320 13:37:43.422630 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:37:43.422642 543705 net.go:698] Add success.
I0320 13:37:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:37:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:37:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:37:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:37:53.409785 543705 memory.go:184] no items to output this cycle
I0320 13:37:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 13:38:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:03.409782 543705 memory.go:184] no items to output this cycle
I0320 13:38:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 13:38:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:13.409805 543705 memory.go:191] Add success.
I0320 13:38:13.409815 543705 cpu.go:282] Add success.
W0320 13:38:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:38:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:38:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:38:13.420156 543705 net.go:648] Add success.
I0320 13:38:13.422660 543705 net.go:770] primary dev: ETH0
I0320 13:38:13.422674 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:38:13.422686 543705 net.go:698] Add success.
I0320 13:38:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:38:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:38:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 13:38:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:38:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 13:38:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:38:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:38:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:38:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:38:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:38:16.472468 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:38:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 13:38:23.409790 543705 memory.go:184] no items to output this cycle
I0320 13:38:24.545368 543705 disk_info.go:125] begin check local disk info of client
I0320 13:38:24.547862 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:38:24.547868 543705 disk_info.go:196] parse disk info done, disk is : [0xc000353300 0xc000353340]
E0320 13:38:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:33.409807 543705 memory.go:184] no items to output this cycle
I0320 13:38:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 13:38:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:43.409793 543705 memory.go:191] Add success.
I0320 13:38:43.409796 543705 cpu.go:282] Add success.
I0320 13:38:43.419758 543705 net.go:648] Add success.
I0320 13:38:43.422487 543705 net.go:770] primary dev: ETH0
I0320 13:38:43.422503 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:38:43.422518 543705 net.go:698] Add success.
I0320 13:38:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:38:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:38:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:38:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:38:53.409812 543705 memory.go:184] no items to output this cycle
I0320 13:38:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 13:39:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:03.409783 543705 memory.go:184] no items to output this cycle
I0320 13:39:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 13:39:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:13.409807 543705 memory.go:191] Add success.
I0320 13:39:13.409815 543705 cpu.go:282] Add success.
W0320 13:39:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:39:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:39:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:39:13.420086 543705 net.go:648] Add success.
I0320 13:39:13.422695 543705 net.go:770] primary dev: ETH0
I0320 13:39:13.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:39:13.422719 543705 net.go:698] Add success.
I0320 13:39:13.468325 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ba69651e-b19b-4076-ac31-367afe829ae5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:39:13.468359 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:39:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:39:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:39:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 13:39:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:39:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 13:39:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:39:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:39:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:39:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:39:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:39:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:39:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:23.409792 543705 memory.go:184] no items to output this cycle
I0320 13:39:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 13:39:24.548382 543705 disk_info.go:125] begin check local disk info of client
I0320 13:39:24.550830 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:39:24.550836 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352e00 0xc000352e40]
E0320 13:39:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:33.409811 543705 memory.go:184] no items to output this cycle
I0320 13:39:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 13:39:38.494232 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:39:38.494240 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:39:43.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:43.410673 543705 memory.go:191] Add success.
I0320 13:39:43.409983 543705 cpu.go:282] Add success.
I0320 13:39:43.419708 543705 net.go:648] Add success.
I0320 13:39:43.422543 543705 net.go:770] primary dev: ETH0
I0320 13:39:43.422556 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:39:43.422569 543705 net.go:698] Add success.
I0320 13:39:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:39:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:39:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:39:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:39:53.409809 543705 memory.go:184] no items to output this cycle
I0320 13:39:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 13:40:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:03.409764 543705 memory.go:184] no items to output this cycle
I0320 13:40:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:40:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:13.409806 543705 memory.go:191] Add success.
I0320 13:40:13.409816 543705 cpu.go:282] Add success.
W0320 13:40:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:40:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:40:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:40:13.420063 543705 net.go:648] Add success.
I0320 13:40:13.422518 543705 net.go:770] primary dev: ETH0
I0320 13:40:13.422531 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:40:13.422543 543705 net.go:698] Add success.
I0320 13:40:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:40:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:40:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 13:40:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:40:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 13:40:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:40:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:40:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:40:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:40:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:40:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:40:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:23.409793 543705 memory.go:184] no items to output this cycle
I0320 13:40:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 13:40:24.550915 543705 disk_info.go:125] begin check local disk info of client
I0320 13:40:24.553357 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:40:24.553365 543705 disk_info.go:196] parse disk info done, disk is : [0xc000273dc0 0xc000273e00]
E0320 13:40:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:33.409806 543705 memory.go:184] no items to output this cycle
I0320 13:40:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 13:40:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:43.409786 543705 memory.go:191] Add success.
I0320 13:40:43.409803 543705 cpu.go:282] Add success.
I0320 13:40:43.419915 543705 net.go:648] Add success.
I0320 13:40:43.422650 543705 net.go:770] primary dev: ETH0
I0320 13:40:43.422664 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:40:43.422677 543705 net.go:698] Add success.
I0320 13:40:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:40:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:40:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:40:53.410241 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:40:53.410261 543705 memory.go:184] no items to output this cycle
I0320 13:40:53.410283 543705 cpu.go:275] no items to output this cycle
E0320 13:41:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:03.409775 543705 memory.go:184] no items to output this cycle
I0320 13:41:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 13:41:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:13.409781 543705 memory.go:191] Add success.
I0320 13:41:13.409799 543705 cpu.go:282] Add success.
W0320 13:41:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:41:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:41:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:41:13.420097 543705 net.go:648] Add success.
I0320 13:41:13.422966 543705 net.go:770] primary dev: ETH0
I0320 13:41:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:41:13.422992 543705 net.go:698] Add success.
I0320 13:41:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:41:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:41:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 13:41:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:41:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 13:41:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:41:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:41:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:41:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:41:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:41:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:41:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:23.409773 543705 memory.go:184] no items to output this cycle
I0320 13:41:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:41:24.553402 543705 disk_info.go:125] begin check local disk info of client
I0320 13:41:24.555879 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:41:24.555885 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004861c0 0xc000486200]
E0320 13:41:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:33.409803 543705 memory.go:184] no items to output this cycle
I0320 13:41:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 13:41:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:43.409798 543705 memory.go:191] Add success.
I0320 13:41:43.409819 543705 cpu.go:282] Add success.
I0320 13:41:43.419995 543705 net.go:648] Add success.
I0320 13:41:43.422829 543705 net.go:770] primary dev: ETH0
I0320 13:41:43.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:41:43.422855 543705 net.go:698] Add success.
I0320 13:41:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:41:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:41:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:41:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:41:53.409806 543705 memory.go:184] no items to output this cycle
I0320 13:41:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 13:42:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:03.409797 543705 memory.go:184] no items to output this cycle
I0320 13:42:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 13:42:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:13.409788 543705 memory.go:191] Add success.
I0320 13:42:13.409790 543705 cpu.go:282] Add success.
W0320 13:42:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:42:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:42:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:42:13.420151 543705 net.go:648] Add success.
I0320 13:42:13.423117 543705 net.go:770] primary dev: ETH0
I0320 13:42:13.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:42:13.423145 543705 net.go:698] Add success.
I0320 13:42:13.486300 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8b2b0e00-a602-4c58-9ba8-ac7dab9c72db","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:42:13.486335 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 13:42:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:42:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 13:42:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:42:14.455908 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:42:14.455917 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:42:14.455922 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:42:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 13:42:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:42:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:42:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:42:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:42:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:42:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:42:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:42:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:42:23.410270 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:23.410289 543705 memory.go:184] no items to output this cycle
I0320 13:42:23.410307 543705 cpu.go:275] no items to output this cycle
I0320 13:42:24.556420 543705 disk_info.go:125] begin check local disk info of client
I0320 13:42:24.558870 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:42:24.558875 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377200 0xc000377240]
E0320 13:42:33.409854 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:33.409875 543705 memory.go:184] no items to output this cycle
I0320 13:42:33.409958 543705 cpu.go:275] no items to output this cycle
I0320 13:42:38.495236 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:42:38.495242 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:42:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:43.410520 543705 memory.go:191] Add success.
I0320 13:42:43.409822 543705 cpu.go:282] Add success.
I0320 13:42:43.420202 543705 net.go:648] Add success.
I0320 13:42:43.423091 543705 net.go:770] primary dev: ETH0
I0320 13:42:43.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:42:43.423115 543705 net.go:698] Add success.
I0320 13:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:42:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:42:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:42:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:42:53.409780 543705 memory.go:184] no items to output this cycle
I0320 13:42:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 13:43:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:03.409772 543705 memory.go:184] no items to output this cycle
I0320 13:43:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:43:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:13.409792 543705 memory.go:191] Add success.
I0320 13:43:13.409792 543705 cpu.go:282] Add success.
W0320 13:43:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:43:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:43:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:43:13.420159 543705 net.go:648] Add success.
I0320 13:43:13.423255 543705 net.go:770] primary dev: ETH0
I0320 13:43:13.423268 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:43:13.423290 543705 net.go:698] Add success.
I0320 13:43:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:43:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:43:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 13:43:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:43:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 13:43:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:43:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:43:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:43:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:43:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:43:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:43:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:23.409773 543705 memory.go:184] no items to output this cycle
I0320 13:43:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 13:43:24.558953 543705 disk_info.go:125] begin check local disk info of client
I0320 13:43:24.561522 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:43:24.561528 543705 disk_info.go:196] parse disk info done, disk is : [0xc000385d40 0xc000385d80]
E0320 13:43:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:33.409815 543705 memory.go:184] no items to output this cycle
I0320 13:43:33.409829 543705 cpu.go:275] no items to output this cycle
E0320 13:43:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:43.409793 543705 memory.go:191] Add success.
I0320 13:43:43.409808 543705 cpu.go:282] Add success.
I0320 13:43:43.420067 543705 net.go:648] Add success.
I0320 13:43:43.422744 543705 net.go:770] primary dev: ETH0
I0320 13:43:43.422757 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:43:43.422770 543705 net.go:698] Add success.
I0320 13:43:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:43:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:43:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:43:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:43:53.409775 543705 memory.go:184] no items to output this cycle
I0320 13:43:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 13:44:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:03.409769 543705 memory.go:184] no items to output this cycle
I0320 13:44:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:44:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:13.409779 543705 memory.go:191] Add success.
I0320 13:44:13.409798 543705 cpu.go:282] Add success.
W0320 13:44:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:44:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:44:13.412388 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:44:13.419987 543705 net.go:648] Add success.
I0320 13:44:13.421664 543705 net.go:770] primary dev: ETH0
I0320 13:44:13.421677 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:44:13.421690 543705 net.go:698] Add success.
I0320 13:44:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:44:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:44:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 13:44:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:44:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 13:44:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:44:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:44:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:44:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:44:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:44:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:44:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:23.409812 543705 memory.go:184] no items to output this cycle
I0320 13:44:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 13:44:24.562498 543705 disk_info.go:125] begin check local disk info of client
I0320 13:44:24.564935 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:44:24.564941 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb480 0xc0001fb4c0]
E0320 13:44:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:44:33.409790 543705 memory.go:184] no items to output this cycle
E0320 13:44:43.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:43.409924 543705 memory.go:191] Add success.
I0320 13:44:43.410058 543705 cpu.go:282] Add success.
I0320 13:44:43.419735 543705 net.go:648] Add success.
I0320 13:44:43.422321 543705 net.go:770] primary dev: ETH0
I0320 13:44:43.422337 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:44:43.422350 543705 net.go:698] Add success.
I0320 13:44:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:44:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:44:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:44:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:44:53.409772 543705 memory.go:184] no items to output this cycle
I0320 13:44:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 13:45:03.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:03.409760 543705 memory.go:184] no items to output this cycle
I0320 13:45:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 13:45:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:13.409812 543705 memory.go:191] Add success.
I0320 13:45:13.409829 543705 cpu.go:282] Add success.
W0320 13:45:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:45:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:45:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:45:13.420130 543705 net.go:648] Add success.
I0320 13:45:13.422760 543705 net.go:770] primary dev: ETH0
I0320 13:45:13.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:45:13.422786 543705 net.go:698] Add success.
I0320 13:45:13.492277 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6a03445c-11d3-4262-91d8-e16a49e56560","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:45:13.492314 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:45:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:45:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:45:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 13:45:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:45:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 13:45:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:45:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:45:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:45:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:45:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:45:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:45:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:23.409777 543705 memory.go:184] no items to output this cycle
I0320 13:45:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 13:45:24.565469 543705 disk_info.go:125] begin check local disk info of client
I0320 13:45:24.567893 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:45:24.567898 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c54c0 0xc0000c5500]
E0320 13:45:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:33.409799 543705 memory.go:184] no items to output this cycle
I0320 13:45:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 13:45:38.496229 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:45:38.496235 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:45:43.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:43.410706 543705 memory.go:191] Add success.
I0320 13:45:43.410024 543705 cpu.go:282] Add success.
I0320 13:45:43.419716 543705 net.go:648] Add success.
I0320 13:45:43.422228 543705 net.go:770] primary dev: ETH0
I0320 13:45:43.422241 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:45:43.422253 543705 net.go:698] Add success.
I0320 13:45:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:45:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:45:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:45:53.409781 543705 memory.go:184] no items to output this cycle
I0320 13:45:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 13:46:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:03.409795 543705 memory.go:184] no items to output this cycle
I0320 13:46:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:46:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:13.409783 543705 memory.go:191] Add success.
I0320 13:46:13.409803 543705 cpu.go:282] Add success.
W0320 13:46:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:46:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:46:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:46:13.420055 543705 net.go:648] Add success.
I0320 13:46:13.423013 543705 net.go:770] primary dev: ETH0
I0320 13:46:13.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:46:13.423040 543705 net.go:698] Add success.
I0320 13:46:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:46:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:46:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 13:46:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:46:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 13:46:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:46:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:46:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:46:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:46:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:46:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:23.409794 543705 memory.go:184] no items to output this cycle
I0320 13:46:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 13:46:24.568481 543705 disk_info.go:125] begin check local disk info of client
I0320 13:46:24.570963 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:46:24.570969 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032eb00 0xc00032eb40]
E0320 13:46:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:33.409807 543705 memory.go:184] no items to output this cycle
I0320 13:46:33.409822 543705 cpu.go:275] no items to output this cycle
E0320 13:46:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:43.409793 543705 memory.go:191] Add success.
I0320 13:46:43.409796 543705 cpu.go:282] Add success.
I0320 13:46:43.420172 543705 net.go:648] Add success.
I0320 13:46:43.422768 543705 net.go:770] primary dev: ETH0
I0320 13:46:43.422781 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:46:43.422792 543705 net.go:698] Add success.
I0320 13:46:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:46:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:46:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:46:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:46:53.409772 543705 memory.go:184] no items to output this cycle
I0320 13:46:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 13:47:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:03.409797 543705 memory.go:184] no items to output this cycle
I0320 13:47:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 13:47:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:13.409786 543705 memory.go:191] Add success.
I0320 13:47:13.409788 543705 cpu.go:282] Add success.
W0320 13:47:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:47:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:47:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:47:13.420246 543705 net.go:648] Add success.
I0320 13:47:13.423257 543705 net.go:770] primary dev: ETH0
I0320 13:47:13.423270 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:47:13.423283 543705 net.go:698] Add success.
I0320 13:47:13.452774 543705 event_worker.go:152] Polling the log file for events...
W0320 13:47:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:47:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 13:47:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:47:14.456784 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:47:14.456794 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:47:14.456800 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:47:14.456842 543705 disk_worker.go:494] system disk:vda1
I0320 13:47:14.456885 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:47:15.456860 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:47:15.456869 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:47:16.457950 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:47:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:47:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:47:16.458021 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:47:16.472346 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:47:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:23.409783 543705 memory.go:184] no items to output this cycle
I0320 13:47:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 13:47:24.571047 543705 disk_info.go:125] begin check local disk info of client
I0320 13:47:24.573449 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:47:24.573455 543705 disk_info.go:196] parse disk info done, disk is : [0xc000395780 0xc0003957c0]
E0320 13:47:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:33.409774 543705 memory.go:184] no items to output this cycle
I0320 13:47:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 13:47:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:43.409811 543705 memory.go:191] Add success.
I0320 13:47:43.409822 543705 cpu.go:282] Add success.
I0320 13:47:43.419998 543705 net.go:648] Add success.
I0320 13:47:43.422609 543705 net.go:770] primary dev: ETH0
I0320 13:47:43.422643 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:47:43.422657 543705 net.go:698] Add success.
I0320 13:47:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:47:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:47:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:47:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:47:53.409784 543705 memory.go:184] no items to output this cycle
I0320 13:47:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 13:48:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:03.409775 543705 memory.go:184] no items to output this cycle
I0320 13:48:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 13:48:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:13.409786 543705 cpu.go:282] Add success.
I0320 13:48:13.409800 543705 memory.go:191] Add success.
W0320 13:48:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:48:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:48:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:48:13.420048 543705 net.go:648] Add success.
I0320 13:48:13.423039 543705 net.go:770] primary dev: ETH0
I0320 13:48:13.423055 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:48:13.423069 543705 net.go:698] Add success.
I0320 13:48:13.469041 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"716ee91f-db02-4c88-9117-b82e41292d82","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:48:13.469076 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:48:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:48:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:48:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 13:48:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:48:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 13:48:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:48:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:48:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:48:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:48:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:48:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:23.409773 543705 memory.go:184] no items to output this cycle
I0320 13:48:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 13:48:24.573505 543705 disk_info.go:125] begin check local disk info of client
I0320 13:48:24.575971 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:48:24.575977 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4b80 0xc0000c4bc0]
E0320 13:48:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:33.409802 543705 memory.go:184] no items to output this cycle
I0320 13:48:33.409823 543705 cpu.go:275] no items to output this cycle
I0320 13:48:38.497241 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:48:38.497248 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:48:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:43.410628 543705 memory.go:191] Add success.
I0320 13:48:43.409826 543705 cpu.go:282] Add success.
I0320 13:48:43.420350 543705 net.go:648] Add success.
I0320 13:48:43.422981 543705 net.go:770] primary dev: ETH0
I0320 13:48:43.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:48:43.423009 543705 net.go:698] Add success.
I0320 13:48:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:48:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:48:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:48:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:48:53.409787 543705 memory.go:184] no items to output this cycle
I0320 13:48:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 13:49:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:03.409783 543705 cpu.go:275] no items to output this cycle
I0320 13:49:03.409788 543705 memory.go:184] no items to output this cycle
E0320 13:49:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:13.409801 543705 memory.go:191] Add success.
I0320 13:49:13.409801 543705 cpu.go:282] Add success.
W0320 13:49:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:49:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:49:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:49:13.420127 543705 net.go:648] Add success.
I0320 13:49:13.422781 543705 net.go:770] primary dev: ETH0
I0320 13:49:13.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:49:13.422804 543705 net.go:698] Add success.
I0320 13:49:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:49:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:49:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 13:49:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:49:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 13:49:14.456578 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:49:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:49:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:49:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:49:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:49:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:49:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:23.409777 543705 memory.go:184] no items to output this cycle
I0320 13:49:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 13:49:24.576511 543705 disk_info.go:125] begin check local disk info of client
I0320 13:49:24.579001 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:49:24.579007 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaa80 0xc0001aaac0]
E0320 13:49:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:33.409797 543705 memory.go:184] no items to output this cycle
I0320 13:49:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 13:49:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:43.409802 543705 memory.go:191] Add success.
I0320 13:49:43.409803 543705 cpu.go:282] Add success.
I0320 13:49:43.419869 543705 net.go:648] Add success.
I0320 13:49:43.422804 543705 net.go:770] primary dev: ETH0
I0320 13:49:43.422818 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:49:43.422832 543705 net.go:698] Add success.
I0320 13:49:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:49:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:49:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:49:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:49:53.409789 543705 memory.go:184] no items to output this cycle
I0320 13:49:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 13:50:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:03.409879 543705 cpu.go:275] no items to output this cycle
I0320 13:50:03.409886 543705 memory.go:184] no items to output this cycle
E0320 13:50:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:13.409826 543705 memory.go:191] Add success.
I0320 13:50:13.409832 543705 cpu.go:282] Add success.
W0320 13:50:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:50:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:50:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:50:13.420158 543705 net.go:648] Add success.
I0320 13:50:13.422990 543705 net.go:770] primary dev: ETH0
I0320 13:50:13.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:50:13.423015 543705 net.go:698] Add success.
I0320 13:50:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:50:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:50:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 13:50:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:50:14.456557 543705 disk_worker.go:494] system disk:vda1
I0320 13:50:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:50:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:50:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:50:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:50:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:50:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:50:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:23.409795 543705 memory.go:184] no items to output this cycle
I0320 13:50:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 13:50:24.579541 543705 disk_info.go:125] begin check local disk info of client
I0320 13:50:24.582059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:50:24.582064 543705 disk_info.go:196] parse disk info done, disk is : [0xc000551480 0xc0005514c0]
E0320 13:50:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:33.409816 543705 memory.go:184] no items to output this cycle
I0320 13:50:33.409828 543705 cpu.go:275] no items to output this cycle
E0320 13:50:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:43.409800 543705 memory.go:191] Add success.
I0320 13:50:43.409803 543705 cpu.go:282] Add success.
I0320 13:50:43.419896 543705 net.go:648] Add success.
I0320 13:50:43.422907 543705 net.go:770] primary dev: ETH0
I0320 13:50:43.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:50:43.422932 543705 net.go:698] Add success.
I0320 13:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:50:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:50:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:50:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:50:53.409809 543705 memory.go:184] no items to output this cycle
I0320 13:50:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 13:51:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:03.409771 543705 memory.go:184] no items to output this cycle
I0320 13:51:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 13:51:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:13.409781 543705 memory.go:191] Add success.
I0320 13:51:13.409802 543705 cpu.go:282] Add success.
W0320 13:51:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:51:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:51:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:51:13.420260 543705 net.go:648] Add success.
I0320 13:51:13.423429 543705 net.go:770] primary dev: ETH0
I0320 13:51:13.423443 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:51:13.423454 543705 net.go:698] Add success.
I0320 13:51:13.463884 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"430d040e-5689-4914-8eee-5f9a935f571b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:51:13.463915 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:51:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:51:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:51:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 13:51:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:51:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 13:51:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:51:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:51:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:51:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:51:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:51:16.472413 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:51:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:51:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 13:51:24.582541 543705 disk_info.go:125] begin check local disk info of client
I0320 13:51:24.585072 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:51:24.585078 543705 disk_info.go:196] parse disk info done, disk is : [0xc000465740 0xc000465780]
E0320 13:51:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:33.409786 543705 memory.go:184] no items to output this cycle
I0320 13:51:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 13:51:38.498239 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:51:38.498246 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:51:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:43.410719 543705 memory.go:191] Add success.
I0320 13:51:43.409799 543705 cpu.go:282] Add success.
I0320 13:51:43.420448 543705 net.go:648] Add success.
I0320 13:51:43.423588 543705 net.go:770] primary dev: ETH0
I0320 13:51:43.423602 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:51:43.423614 543705 net.go:698] Add success.
I0320 13:51:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:51:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:51:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:51:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:51:53.409787 543705 memory.go:184] no items to output this cycle
I0320 13:51:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 13:52:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:03.409783 543705 memory.go:184] no items to output this cycle
I0320 13:52:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 13:52:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:13.409819 543705 memory.go:191] Add success.
I0320 13:52:13.409820 543705 cpu.go:282] Add success.
W0320 13:52:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:52:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:52:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:52:13.420501 543705 net.go:648] Add success.
I0320 13:52:13.423285 543705 net.go:770] primary dev: ETH0
I0320 13:52:13.423300 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:52:13.423313 543705 net.go:698] Add success.
W0320 13:52:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:52:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 13:52:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:52:14.455870 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:52:14.455879 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:52:14.455901 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:52:14.456547 543705 disk_worker.go:494] system disk:vda1
I0320 13:52:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:52:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:52:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:52:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:52:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:52:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:52:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:52:16.472314 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:52:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:23.409794 543705 memory.go:184] no items to output this cycle
I0320 13:52:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 13:52:24.585564 543705 disk_info.go:125] begin check local disk info of client
I0320 13:52:24.588007 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:52:24.588013 543705 disk_info.go:196] parse disk info done, disk is : [0xc000594dc0 0xc000594e00]
E0320 13:52:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:33.409787 543705 memory.go:184] no items to output this cycle
I0320 13:52:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 13:52:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:43.409814 543705 memory.go:191] Add success.
I0320 13:52:43.409815 543705 cpu.go:282] Add success.
I0320 13:52:43.419981 543705 net.go:648] Add success.
I0320 13:52:43.422730 543705 net.go:770] primary dev: ETH0
I0320 13:52:43.422743 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:52:43.422756 543705 net.go:698] Add success.
I0320 13:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:52:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:52:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:52:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:52:53.409808 543705 memory.go:184] no items to output this cycle
I0320 13:52:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 13:53:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:03.409798 543705 memory.go:184] no items to output this cycle
I0320 13:53:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 13:53:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:13.409784 543705 memory.go:191] Add success.
I0320 13:53:13.409810 543705 cpu.go:282] Add success.
W0320 13:53:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:53:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:53:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:53:13.420585 543705 net.go:648] Add success.
I0320 13:53:13.423285 543705 net.go:770] primary dev: ETH0
I0320 13:53:13.423300 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:53:13.423313 543705 net.go:698] Add success.
I0320 13:53:14.453951 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:53:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:53:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 13:53:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:53:14.456549 543705 disk_worker.go:494] system disk:vda1
I0320 13:53:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:53:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:53:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:53:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:53:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:53:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:53:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:23.409796 543705 memory.go:184] no items to output this cycle
I0320 13:53:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 13:53:24.588090 543705 disk_info.go:125] begin check local disk info of client
I0320 13:53:24.590532 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:53:24.590538 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b5c0 0xc00007b600]
E0320 13:53:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:33.409788 543705 memory.go:184] no items to output this cycle
I0320 13:53:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 13:53:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:43.409818 543705 memory.go:191] Add success.
I0320 13:53:43.409826 543705 cpu.go:282] Add success.
I0320 13:53:43.419969 543705 net.go:648] Add success.
I0320 13:53:43.423021 543705 net.go:770] primary dev: ETH0
I0320 13:53:43.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:53:43.423052 543705 net.go:698] Add success.
I0320 13:53:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:53:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:53:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:53:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:53:53.409784 543705 memory.go:184] no items to output this cycle
I0320 13:53:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 13:54:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:03.409808 543705 memory.go:184] no items to output this cycle
I0320 13:54:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 13:54:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:13.409778 543705 memory.go:191] Add success.
W0320 13:54:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:54:13.409803 543705 cpu.go:282] Add success.
W0320 13:54:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:54:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:54:13.420053 543705 net.go:648] Add success.
I0320 13:54:13.422919 543705 net.go:770] primary dev: ETH0
I0320 13:54:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:54:13.423107 543705 net.go:698] Add success.
I0320 13:54:13.464110 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7a460e7f-8cff-4958-b835-85d3f7a03b33","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:54:13.464141 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 13:54:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:54:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:54:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 13:54:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:54:14.456682 543705 disk_worker.go:494] system disk:vda1
I0320 13:54:14.456710 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:54:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:54:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:54:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:54:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:54:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:54:23.409822 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:23.409847 543705 memory.go:184] no items to output this cycle
I0320 13:54:23.409890 543705 cpu.go:275] no items to output this cycle
I0320 13:54:24.590592 543705 disk_info.go:125] begin check local disk info of client
I0320 13:54:24.593033 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:54:24.593039 543705 disk_info.go:196] parse disk info done, disk is : [0xc000483240 0xc000483280]
E0320 13:54:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:33.409790 543705 memory.go:184] no items to output this cycle
I0320 13:54:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 13:54:38.499245 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:54:38.499252 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:54:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:43.410701 543705 memory.go:191] Add success.
I0320 13:54:43.409827 543705 cpu.go:282] Add success.
I0320 13:54:43.420412 543705 net.go:648] Add success.
I0320 13:54:43.423471 543705 net.go:770] primary dev: ETH0
I0320 13:54:43.423483 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:54:43.423496 543705 net.go:698] Add success.
I0320 13:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:54:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:54:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:54:53.410249 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:54:53.410265 543705 memory.go:184] no items to output this cycle
I0320 13:54:53.410267 543705 cpu.go:275] no items to output this cycle
E0320 13:55:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:03.409767 543705 memory.go:184] no items to output this cycle
I0320 13:55:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 13:55:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:13.409813 543705 memory.go:191] Add success.
I0320 13:55:13.409821 543705 cpu.go:282] Add success.
W0320 13:55:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:55:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:55:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:55:13.420042 543705 net.go:648] Add success.
I0320 13:55:13.422632 543705 net.go:770] primary dev: ETH0
I0320 13:55:13.422645 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:55:13.422657 543705 net.go:698] Add success.
I0320 13:55:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:55:14.455500 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:55:14.455521 543705 disk_worker.go:708] disk space is not compliant
W0320 13:55:14.455525 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:55:14.457038 543705 disk_worker.go:494] system disk:vda1
I0320 13:55:14.457066 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:55:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:55:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:55:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:55:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:55:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:55:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:23.409775 543705 memory.go:184] no items to output this cycle
I0320 13:55:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 13:55:24.593118 543705 disk_info.go:125] begin check local disk info of client
I0320 13:55:24.595587 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:55:24.595593 543705 disk_info.go:196] parse disk info done, disk is : [0xc000551300 0xc000551340]
E0320 13:55:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:33.409818 543705 memory.go:184] no items to output this cycle
I0320 13:55:33.409833 543705 cpu.go:275] no items to output this cycle
E0320 13:55:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:43.409783 543705 memory.go:191] Add success.
I0320 13:55:43.409816 543705 cpu.go:282] Add success.
I0320 13:55:43.419894 543705 net.go:648] Add success.
I0320 13:55:43.422596 543705 net.go:770] primary dev: ETH0
I0320 13:55:43.422611 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:55:43.422627 543705 net.go:698] Add success.
I0320 13:55:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:55:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:55:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:55:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:55:53.409798 543705 memory.go:184] no items to output this cycle
I0320 13:55:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 13:56:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:03.409775 543705 memory.go:184] no items to output this cycle
I0320 13:56:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 13:56:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:13.409806 543705 memory.go:191] Add success.
I0320 13:56:13.409816 543705 cpu.go:282] Add success.
W0320 13:56:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:56:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:56:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:56:13.420221 543705 net.go:648] Add success.
I0320 13:56:13.422905 543705 net.go:770] primary dev: ETH0
I0320 13:56:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:56:13.422932 543705 net.go:698] Add success.
I0320 13:56:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:56:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:56:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 13:56:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:56:14.456809 543705 disk_worker.go:494] system disk:vda1
I0320 13:56:14.456839 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:56:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:56:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:56:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:56:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:56:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:56:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:23.409769 543705 memory.go:184] no items to output this cycle
I0320 13:56:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 13:56:24.596666 543705 disk_info.go:125] begin check local disk info of client
I0320 13:56:24.599159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:56:24.599164 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c640 0xc00039c680]
E0320 13:56:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:33.409806 543705 memory.go:184] no items to output this cycle
I0320 13:56:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 13:56:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:43.409814 543705 memory.go:191] Add success.
I0320 13:56:43.409821 543705 cpu.go:282] Add success.
I0320 13:56:43.419888 543705 net.go:648] Add success.
I0320 13:56:43.422464 543705 net.go:770] primary dev: ETH0
I0320 13:56:43.422477 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:56:43.422490 543705 net.go:698] Add success.
I0320 13:56:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:56:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:56:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:56:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:56:53.409801 543705 memory.go:184] no items to output this cycle
I0320 13:56:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 13:57:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:03.409779 543705 memory.go:184] no items to output this cycle
I0320 13:57:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 13:57:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:13.409779 543705 memory.go:191] Add success.
W0320 13:57:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 13:57:13.409809 543705 cpu.go:282] Add success.
W0320 13:57:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:57:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:57:13.425860 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 13:57:13.425945 543705 net.go:770] primary dev: ETH0
I0320 13:57:13.425957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:57:13.425968 543705 net.go:698] Add success.
I0320 13:57:13.426330 543705 net.go:648] Add success.
I0320 13:57:13.453722 543705 event_worker.go:152] Polling the log file for events...
I0320 13:57:13.464609 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a9a1a123-db3f-455e-9540-b29ad7f6c465","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 13:57:13.464647 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 13:57:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:57:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 13:57:14.455151 543705 disk_worker.go:728] disk inode is not compliant
E0320 13:57:14.456261 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 13:57:14.456270 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 13:57:14.456285 543705 custom_config.go:64] query custom config with name: gpu
I0320 13:57:14.457026 543705 disk_worker.go:494] system disk:vda1
I0320 13:57:14.457066 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 13:57:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 13:57:15.456829 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:57:16.457953 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 13:57:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 13:57:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:57:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:57:16.472339 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:57:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:23.409791 543705 memory.go:184] no items to output this cycle
I0320 13:57:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 13:57:24.599637 543705 disk_info.go:125] begin check local disk info of client
I0320 13:57:24.602081 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:57:24.602086 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5680 0xc0000c56c0]
E0320 13:57:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:33.409779 543705 memory.go:184] no items to output this cycle
I0320 13:57:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 13:57:38.500246 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 13:57:38.500253 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 13:57:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:43.410714 543705 memory.go:191] Add success.
I0320 13:57:43.409801 543705 cpu.go:282] Add success.
I0320 13:57:43.420431 543705 net.go:648] Add success.
I0320 13:57:43.423034 543705 net.go:770] primary dev: ETH0
I0320 13:57:43.423047 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:57:43.423059 543705 net.go:698] Add success.
I0320 13:57:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:57:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:57:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:57:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:57:53.409780 543705 memory.go:184] no items to output this cycle
I0320 13:57:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 13:58:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:03.409803 543705 cpu.go:275] no items to output this cycle
I0320 13:58:03.409809 543705 memory.go:184] no items to output this cycle
E0320 13:58:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:13.409827 543705 memory.go:191] Add success.
I0320 13:58:13.409833 543705 cpu.go:282] Add success.
W0320 13:58:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:58:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:58:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:58:13.420285 543705 net.go:648] Add success.
I0320 13:58:13.423306 543705 net.go:770] primary dev: ETH0
I0320 13:58:13.423322 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:58:13.423337 543705 net.go:698] Add success.
I0320 13:58:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:58:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:58:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 13:58:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:58:14.456626 543705 disk_worker.go:494] system disk:vda1
I0320 13:58:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:58:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:58:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:58:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:58:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:58:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:58:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 13:58:23.409786 543705 memory.go:184] no items to output this cycle
I0320 13:58:24.602651 543705 disk_info.go:125] begin check local disk info of client
I0320 13:58:24.605110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:58:24.605118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003912c0 0xc000391300]
E0320 13:58:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:33.409781 543705 memory.go:184] no items to output this cycle
I0320 13:58:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 13:58:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:43.409784 543705 memory.go:191] Add success.
I0320 13:58:43.409800 543705 cpu.go:282] Add success.
I0320 13:58:43.420014 543705 net.go:648] Add success.
I0320 13:58:43.422871 543705 net.go:770] primary dev: ETH0
I0320 13:58:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:58:43.422896 543705 net.go:698] Add success.
I0320 13:58:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:58:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:58:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:58:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:58:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 13:58:53.409783 543705 memory.go:184] no items to output this cycle
E0320 13:59:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:03.409768 543705 memory.go:184] no items to output this cycle
I0320 13:59:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 13:59:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:13.409779 543705 memory.go:191] Add success.
I0320 13:59:13.409799 543705 cpu.go:282] Add success.
W0320 13:59:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 13:59:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 13:59:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 13:59:13.420126 543705 net.go:648] Add success.
I0320 13:59:13.423248 543705 net.go:770] primary dev: ETH0
I0320 13:59:13.423262 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:59:13.423273 543705 net.go:698] Add success.
I0320 13:59:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 13:59:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 13:59:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 13:59:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 13:59:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 13:59:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 13:59:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 13:59:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:59:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:59:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 13:59:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 13:59:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 13:59:23.409783 543705 memory.go:184] no items to output this cycle
I0320 13:59:24.605198 543705 disk_info.go:125] begin check local disk info of client
I0320 13:59:24.607667 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 13:59:24.607672 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8540 0xc0004a8580]
E0320 13:59:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:33.409776 543705 memory.go:184] no items to output this cycle
I0320 13:59:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 13:59:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:43.409792 543705 memory.go:191] Add success.
I0320 13:59:43.409805 543705 cpu.go:282] Add success.
I0320 13:59:43.420002 543705 net.go:648] Add success.
I0320 13:59:43.422623 543705 net.go:770] primary dev: ETH0
I0320 13:59:43.422636 543705 net.go:802] Send network stats successfully!,count is 6
I0320 13:59:43.422648 543705 net.go:698] Add success.
I0320 13:59:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 13:59:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 13:59:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 13:59:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 13:59:53.409802 543705 memory.go:184] no items to output this cycle
I0320 13:59:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 14:00:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:03.409770 543705 memory.go:184] no items to output this cycle
I0320 14:00:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 14:00:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:13.409812 543705 memory.go:191] Add success.
I0320 14:00:13.409816 543705 cpu.go:282] Add success.
W0320 14:00:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:00:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:00:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:00:13.420488 543705 net.go:648] Add success.
I0320 14:00:13.423393 543705 net.go:770] primary dev: ETH0
I0320 14:00:13.423407 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:00:13.423418 543705 net.go:698] Add success.
I0320 14:00:13.463926 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"73a2dfe5-6c99-4bc7-9575-ff78559abaa2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:00:13.463965 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:00:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:00:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:00:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 14:00:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:00:14.456534 543705 disk_worker.go:494] system disk:vda1
I0320 14:00:14.456579 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:00:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:00:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:00:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:00:16.472427 543705 disk_local_worker.go:436] Get disk info: []
I0320 14:00:23.409914 543705 cpu.go:275] no items to output this cycle
E0320 14:00:23.409946 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:23.409978 543705 memory.go:184] no items to output this cycle
I0320 14:00:24.608683 543705 disk_info.go:125] begin check local disk info of client
I0320 14:00:24.611234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:00:24.611240 543705 disk_info.go:196] parse disk info done, disk is : [0xc00052aec0 0xc00052af00]
E0320 14:00:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:33.409788 543705 memory.go:184] no items to output this cycle
I0320 14:00:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 14:00:38.501258 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:00:38.501265 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:00:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:43.410924 543705 memory.go:191] Add success.
I0320 14:00:43.409816 543705 cpu.go:282] Add success.
I0320 14:00:43.420636 543705 net.go:648] Add success.
I0320 14:00:43.423479 543705 net.go:770] primary dev: ETH0
I0320 14:00:43.423491 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:00:43.423505 543705 net.go:698] Add success.
I0320 14:00:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:00:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:00:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:00:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:00:53.409773 543705 memory.go:184] no items to output this cycle
I0320 14:00:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:01:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:03.409775 543705 memory.go:184] no items to output this cycle
I0320 14:01:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 14:01:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:13.409812 543705 memory.go:191] Add success.
I0320 14:01:13.409818 543705 cpu.go:282] Add success.
W0320 14:01:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:01:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:01:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:01:13.420109 543705 net.go:648] Add success.
I0320 14:01:13.423104 543705 net.go:770] primary dev: ETH0
I0320 14:01:13.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:01:13.423128 543705 net.go:698] Add success.
I0320 14:01:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:01:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:01:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 14:01:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:01:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 14:01:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:01:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:01:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:01:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:01:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:01:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:01:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:23.409795 543705 memory.go:184] no items to output this cycle
I0320 14:01:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 14:01:24.611691 543705 disk_info.go:125] begin check local disk info of client
I0320 14:01:24.614225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:01:24.614231 543705 disk_info.go:196] parse disk info done, disk is : [0xc000289980 0xc0002899c0]
E0320 14:01:33.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:33.409888 543705 memory.go:184] no items to output this cycle
I0320 14:01:33.409958 543705 cpu.go:275] no items to output this cycle
E0320 14:01:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:43.409786 543705 memory.go:191] Add success.
I0320 14:01:43.409795 543705 cpu.go:282] Add success.
I0320 14:01:43.420039 543705 net.go:648] Add success.
I0320 14:01:43.422954 543705 net.go:770] primary dev: ETH0
I0320 14:01:43.422972 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:01:43.422987 543705 net.go:698] Add success.
I0320 14:01:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:01:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:01:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:01:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:01:53.409781 543705 memory.go:184] no items to output this cycle
I0320 14:01:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 14:02:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:03.409798 543705 memory.go:184] no items to output this cycle
I0320 14:02:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 14:02:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:13.409786 543705 memory.go:191] Add success.
I0320 14:02:13.409810 543705 cpu.go:282] Add success.
W0320 14:02:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:02:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:02:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:02:13.420080 543705 net.go:648] Add success.
I0320 14:02:13.422863 543705 net.go:770] primary dev: ETH0
I0320 14:02:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:02:13.422889 543705 net.go:698] Add success.
W0320 14:02:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:02:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 14:02:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:02:14.455907 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:02:14.455916 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:02:14.455921 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:02:14.456561 543705 disk_worker.go:494] system disk:vda1
I0320 14:02:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:02:15.456853 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:02:15.456861 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:02:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:02:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:02:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:02:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:02:16.472330 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:02:23.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:23.409820 543705 memory.go:184] no items to output this cycle
I0320 14:02:23.409823 543705 cpu.go:275] no items to output this cycle
I0320 14:02:24.614716 543705 disk_info.go:125] begin check local disk info of client
I0320 14:02:24.617297 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:02:24.617303 543705 disk_info.go:196] parse disk info done, disk is : [0xc000298480 0xc0002984c0]
E0320 14:02:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:33.409784 543705 memory.go:184] no items to output this cycle
I0320 14:02:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 14:02:43.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:43.409949 543705 memory.go:191] Add success.
I0320 14:02:43.410091 543705 cpu.go:282] Add success.
I0320 14:02:43.419730 543705 net.go:648] Add success.
I0320 14:02:43.422391 543705 net.go:770] primary dev: ETH0
I0320 14:02:43.422409 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:02:43.422424 543705 net.go:698] Add success.
I0320 14:02:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:02:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:02:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:02:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:02:53.409809 543705 memory.go:184] no items to output this cycle
I0320 14:02:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:03:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:03.409789 543705 memory.go:184] no items to output this cycle
I0320 14:03:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:03:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:13.409793 543705 memory.go:191] Add success.
I0320 14:03:13.409796 543705 cpu.go:282] Add success.
W0320 14:03:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:03:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:03:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:03:13.420234 543705 net.go:648] Add success.
I0320 14:03:13.423321 543705 net.go:770] primary dev: ETH0
I0320 14:03:13.423334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:03:13.423346 543705 net.go:698] Add success.
I0320 14:03:13.470347 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9f6e0dde-6328-4613-b2e7-684e250fb700","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:03:13.470381 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:03:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:03:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:03:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 14:03:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:03:14.456760 543705 disk_worker.go:494] system disk:vda1
I0320 14:03:14.456791 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:03:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:03:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:03:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:03:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:03:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:03:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:23.409808 543705 memory.go:184] no items to output this cycle
I0320 14:03:23.409823 543705 cpu.go:275] no items to output this cycle
I0320 14:03:24.617670 543705 disk_info.go:125] begin check local disk info of client
I0320 14:03:24.620109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:03:24.620114 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a180 0xc00048a1c0]
E0320 14:03:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:33.409789 543705 memory.go:184] no items to output this cycle
I0320 14:03:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 14:03:38.502291 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:03:38.502298 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:03:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:43.410724 543705 memory.go:191] Add success.
I0320 14:03:43.409820 543705 cpu.go:282] Add success.
I0320 14:03:43.420469 543705 net.go:648] Add success.
I0320 14:03:43.423242 543705 net.go:770] primary dev: ETH0
I0320 14:03:43.423255 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:03:43.423267 543705 net.go:698] Add success.
I0320 14:03:46.458000 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:03:46.458076 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:03:46.458103 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:03:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:03:53.409804 543705 memory.go:184] no items to output this cycle
I0320 14:03:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:04:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:03.409776 543705 memory.go:184] no items to output this cycle
I0320 14:04:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 14:04:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:13.409809 543705 memory.go:191] Add success.
I0320 14:04:13.409821 543705 cpu.go:282] Add success.
W0320 14:04:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:04:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:04:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:04:13.420155 543705 net.go:648] Add success.
I0320 14:04:13.422917 543705 net.go:770] primary dev: ETH0
I0320 14:04:13.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:04:13.422945 543705 net.go:698] Add success.
I0320 14:04:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:04:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:04:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 14:04:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:04:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 14:04:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:04:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:04:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:04:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:04:16.472356 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:04:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 14:04:23.409780 543705 memory.go:184] no items to output this cycle
I0320 14:04:24.620733 543705 disk_info.go:125] begin check local disk info of client
I0320 14:04:24.623153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:04:24.623159 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386c80 0xc000386cc0]
E0320 14:04:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:33.409785 543705 memory.go:184] no items to output this cycle
I0320 14:04:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 14:04:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:43.409909 543705 memory.go:191] Add success.
I0320 14:04:43.409944 543705 cpu.go:282] Add success.
I0320 14:04:43.419740 543705 net.go:648] Add success.
I0320 14:04:43.422973 543705 net.go:770] primary dev: ETH0
I0320 14:04:43.422987 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:04:43.422998 543705 net.go:698] Add success.
I0320 14:04:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:04:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:04:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:04:53.409801 543705 memory.go:184] no items to output this cycle
I0320 14:04:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 14:05:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:03.409783 543705 cpu.go:275] no items to output this cycle
I0320 14:05:03.409790 543705 memory.go:184] no items to output this cycle
E0320 14:05:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:13.409779 543705 memory.go:191] Add success.
I0320 14:05:13.409798 543705 cpu.go:282] Add success.
W0320 14:05:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:05:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:05:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:05:13.420138 543705 net.go:648] Add success.
I0320 14:05:13.423265 543705 net.go:770] primary dev: ETH0
I0320 14:05:13.423280 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:05:13.423293 543705 net.go:698] Add success.
I0320 14:05:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:05:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:05:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 14:05:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:05:14.456476 543705 disk_worker.go:494] system disk:vda1
I0320 14:05:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:05:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:05:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:05:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:05:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:05:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:05:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 14:05:23.409785 543705 memory.go:184] no items to output this cycle
I0320 14:05:24.623748 543705 disk_info.go:125] begin check local disk info of client
I0320 14:05:24.626236 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:05:24.626242 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331980 0xc0003319c0]
E0320 14:05:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:33.409803 543705 memory.go:184] no items to output this cycle
I0320 14:05:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 14:05:43.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:43.409892 543705 memory.go:191] Add success.
I0320 14:05:43.409894 543705 cpu.go:282] Add success.
I0320 14:05:43.419723 543705 net.go:648] Add success.
I0320 14:05:43.422991 543705 net.go:770] primary dev: ETH0
I0320 14:05:43.423003 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:05:43.423014 543705 net.go:698] Add success.
I0320 14:05:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:05:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:05:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:05:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:05:53.409780 543705 memory.go:184] no items to output this cycle
I0320 14:05:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 14:06:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:03.409789 543705 memory.go:184] no items to output this cycle
I0320 14:06:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 14:06:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:13.409806 543705 memory.go:191] Add success.
I0320 14:06:13.409812 543705 cpu.go:282] Add success.
W0320 14:06:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:06:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:06:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:06:13.420063 543705 net.go:648] Add success.
I0320 14:06:13.423097 543705 net.go:770] primary dev: ETH0
I0320 14:06:13.423110 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:06:13.423122 543705 net.go:698] Add success.
I0320 14:06:13.469119 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"063fb934-58de-44bd-96ac-3104517b4175","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:06:13.469151 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:06:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:06:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:06:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 14:06:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:06:14.456598 543705 disk_worker.go:494] system disk:vda1
I0320 14:06:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:06:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:06:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:06:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:06:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:06:16.472372 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:06:23.410245 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:23.410260 543705 memory.go:184] no items to output this cycle
I0320 14:06:23.410288 543705 cpu.go:275] no items to output this cycle
I0320 14:06:24.626764 543705 disk_info.go:125] begin check local disk info of client
I0320 14:06:24.629200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:06:24.629206 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5d40 0xc0000c5d80]
E0320 14:06:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:33.409816 543705 memory.go:184] no items to output this cycle
I0320 14:06:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 14:06:38.503265 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:06:38.503271 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:06:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:43.410654 543705 memory.go:191] Add success.
I0320 14:06:43.409808 543705 cpu.go:282] Add success.
I0320 14:06:43.420353 543705 net.go:648] Add success.
I0320 14:06:43.422927 543705 net.go:770] primary dev: ETH0
I0320 14:06:43.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:06:43.422954 543705 net.go:698] Add success.
I0320 14:06:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:06:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:06:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:06:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:06:53.409777 543705 memory.go:184] no items to output this cycle
I0320 14:06:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 14:07:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:03.409803 543705 memory.go:184] no items to output this cycle
I0320 14:07:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:07:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:13.409804 543705 memory.go:191] Add success.
I0320 14:07:13.409815 543705 cpu.go:282] Add success.
W0320 14:07:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:07:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:07:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:07:13.420068 543705 net.go:648] Add success.
I0320 14:07:13.422719 543705 net.go:770] primary dev: ETH0
I0320 14:07:13.422731 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:07:13.422744 543705 net.go:698] Add success.
I0320 14:07:13.453367 543705 event_worker.go:152] Polling the log file for events...
W0320 14:07:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:07:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0320 14:07:14.455232 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:07:14.455901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:07:14.455910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:07:14.455916 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:07:14.456812 543705 disk_worker.go:494] system disk:vda1
I0320 14:07:14.456841 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:07:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:07:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:07:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:07:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:07:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:07:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:07:16.472321 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:07:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:23.409774 543705 memory.go:184] no items to output this cycle
I0320 14:07:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 14:07:24.629680 543705 disk_info.go:125] begin check local disk info of client
I0320 14:07:24.632071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:07:24.632077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329780 0xc0003297c0]
E0320 14:07:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:33.409787 543705 memory.go:184] no items to output this cycle
I0320 14:07:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 14:07:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:43.409789 543705 memory.go:191] Add success.
I0320 14:07:43.409819 543705 cpu.go:282] Add success.
I0320 14:07:43.419968 543705 net.go:648] Add success.
I0320 14:07:43.422753 543705 net.go:770] primary dev: ETH0
I0320 14:07:43.422766 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:07:43.422778 543705 net.go:698] Add success.
I0320 14:07:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:07:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:07:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:07:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:07:53.409806 543705 memory.go:184] no items to output this cycle
I0320 14:07:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:08:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:03.409784 543705 memory.go:184] no items to output this cycle
I0320 14:08:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 14:08:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:13.409803 543705 memory.go:191] Add success.
I0320 14:08:13.409804 543705 cpu.go:282] Add success.
W0320 14:08:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:08:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:08:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:08:13.420134 543705 net.go:648] Add success.
I0320 14:08:13.423558 543705 net.go:770] primary dev: ETH0
I0320 14:08:13.423574 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:08:13.423588 543705 net.go:698] Add success.
I0320 14:08:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:08:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:08:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 14:08:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:08:14.456504 543705 disk_worker.go:494] system disk:vda1
I0320 14:08:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:08:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:08:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:08:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:08:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:08:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:08:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:23.409796 543705 memory.go:184] no items to output this cycle
I0320 14:08:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 14:08:24.632801 543705 disk_info.go:125] begin check local disk info of client
I0320 14:08:24.635235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:08:24.635241 543705 disk_info.go:196] parse disk info done, disk is : [0xc000296480 0xc0002964c0]
E0320 14:08:33.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:33.409821 543705 memory.go:184] no items to output this cycle
I0320 14:08:33.409831 543705 cpu.go:275] no items to output this cycle
E0320 14:08:43.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:43.409834 543705 memory.go:191] Add success.
I0320 14:08:43.409843 543705 cpu.go:282] Add success.
I0320 14:08:43.420003 543705 net.go:648] Add success.
I0320 14:08:43.423333 543705 net.go:770] primary dev: ETH0
I0320 14:08:43.423348 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:08:43.423362 543705 net.go:698] Add success.
I0320 14:08:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:08:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:08:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:08:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:08:53.409818 543705 memory.go:184] no items to output this cycle
I0320 14:08:53.409827 543705 cpu.go:275] no items to output this cycle
E0320 14:09:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:03.409785 543705 memory.go:184] no items to output this cycle
I0320 14:09:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 14:09:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:13.409809 543705 memory.go:191] Add success.
I0320 14:09:13.409817 543705 cpu.go:282] Add success.
W0320 14:09:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:09:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:09:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:09:13.420139 543705 net.go:648] Add success.
I0320 14:09:13.423044 543705 net.go:770] primary dev: ETH0
I0320 14:09:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:09:13.423070 543705 net.go:698] Add success.
I0320 14:09:13.471158 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cbf034a6-653a-4a32-bc5f-0d789ef92578","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:09:13.471192 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:09:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:09:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 14:09:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:09:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 14:09:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:09:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:09:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:09:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:09:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:09:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:09:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:23.409770 543705 memory.go:184] no items to output this cycle
I0320 14:09:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 14:09:24.635322 543705 disk_info.go:125] begin check local disk info of client
I0320 14:09:24.637863 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:09:24.637869 543705 disk_info.go:196] parse disk info done, disk is : [0xc000481600 0xc000481640]
E0320 14:09:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:33.409880 543705 memory.go:184] no items to output this cycle
I0320 14:09:33.409957 543705 cpu.go:275] no items to output this cycle
I0320 14:09:38.504268 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:09:38.504274 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:09:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:43.410604 543705 memory.go:191] Add success.
I0320 14:09:43.409838 543705 cpu.go:282] Add success.
I0320 14:09:43.420297 543705 net.go:648] Add success.
I0320 14:09:43.422959 543705 net.go:770] primary dev: ETH0
I0320 14:09:43.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:09:43.422985 543705 net.go:698] Add success.
I0320 14:09:46.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:09:46.458072 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:09:46.458100 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:09:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:09:53.409798 543705 memory.go:184] no items to output this cycle
I0320 14:09:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 14:10:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:03.409771 543705 memory.go:184] no items to output this cycle
I0320 14:10:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 14:10:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:13.409807 543705 memory.go:191] Add success.
I0320 14:10:13.409817 543705 cpu.go:282] Add success.
W0320 14:10:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:10:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:10:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:10:13.420056 543705 net.go:648] Add success.
I0320 14:10:13.423092 543705 net.go:770] primary dev: ETH0
I0320 14:10:13.423106 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:10:13.423118 543705 net.go:698] Add success.
I0320 14:10:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:10:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:10:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 14:10:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:10:14.456491 543705 disk_worker.go:494] system disk:vda1
I0320 14:10:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:10:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:10:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:10:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:10:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:10:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:10:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:23.409780 543705 memory.go:184] no items to output this cycle
I0320 14:10:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 14:10:24.638832 543705 disk_info.go:125] begin check local disk info of client
I0320 14:10:24.641291 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:10:24.641297 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbb00 0xc0001fbb40]
E0320 14:10:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:33.409808 543705 memory.go:184] no items to output this cycle
I0320 14:10:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 14:10:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:43.409803 543705 memory.go:191] Add success.
I0320 14:10:43.409805 543705 cpu.go:282] Add success.
I0320 14:10:43.420004 543705 net.go:648] Add success.
I0320 14:10:43.422599 543705 net.go:770] primary dev: ETH0
I0320 14:10:43.422613 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:10:43.422628 543705 net.go:698] Add success.
I0320 14:10:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:10:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:10:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:10:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:10:53.409766 543705 memory.go:184] no items to output this cycle
I0320 14:10:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 14:11:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:03.409777 543705 memory.go:184] no items to output this cycle
I0320 14:11:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 14:11:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:13.409780 543705 memory.go:191] Add success.
W0320 14:11:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 14:11:13.409813 543705 cpu.go:282] Add success.
W0320 14:11:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:11:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:11:13.420184 543705 net.go:648] Add success.
I0320 14:11:13.422888 543705 net.go:770] primary dev: ETH0
I0320 14:11:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:11:13.422916 543705 net.go:698] Add success.
I0320 14:11:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:11:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:11:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 14:11:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:11:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 14:11:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:11:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:11:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:11:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:11:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:11:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:11:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:23.409767 543705 memory.go:184] no items to output this cycle
I0320 14:11:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 14:11:24.641674 543705 disk_info.go:125] begin check local disk info of client
I0320 14:11:24.644142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:11:24.644148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002791c0 0xc000279200]
E0320 14:11:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:33.409782 543705 memory.go:184] no items to output this cycle
I0320 14:11:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 14:11:43.409870 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:43.409899 543705 memory.go:191] Add success.
I0320 14:11:43.410045 543705 cpu.go:282] Add success.
I0320 14:11:43.419710 543705 net.go:648] Add success.
I0320 14:11:43.422715 543705 net.go:770] primary dev: ETH0
I0320 14:11:43.422728 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:11:43.422739 543705 net.go:698] Add success.
I0320 14:11:46.458156 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:11:46.458229 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:11:46.458259 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:11:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:11:53.409773 543705 memory.go:184] no items to output this cycle
I0320 14:11:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 14:12:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:03.409776 543705 memory.go:184] no items to output this cycle
I0320 14:12:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 14:12:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:13.409814 543705 memory.go:191] Add success.
I0320 14:12:13.409828 543705 cpu.go:282] Add success.
W0320 14:12:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:12:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:12:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:12:13.420183 543705 net.go:648] Add success.
I0320 14:12:13.423269 543705 net.go:770] primary dev: ETH0
I0320 14:12:13.423283 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:12:13.423295 543705 net.go:698] Add success.
I0320 14:12:13.567725 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3a4a1e0e-b70b-4cb9-bd63-fe07be62e10b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:12:13.567760 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 14:12:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:12:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 14:12:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:12:14.456866 543705 disk_worker.go:494] system disk:vda1
I0320 14:12:14.456905 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:12:14.457117 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:12:14.457125 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:12:14.457129 543705 custom_config.go:64] query custom config with name: gpu
E0320 14:12:15.456800 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:12:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:12:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:12:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:12:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:12:16.458008 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:12:16.472334 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:12:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:23.409808 543705 memory.go:184] no items to output this cycle
I0320 14:12:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 14:12:24.644230 543705 disk_info.go:125] begin check local disk info of client
I0320 14:12:24.646733 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:12:24.646738 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024bcc0 0xc00024bd00]
E0320 14:12:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:33.409790 543705 memory.go:184] no items to output this cycle
I0320 14:12:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 14:12:38.505283 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:12:38.505290 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:12:43.409865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:43.410789 543705 memory.go:191] Add success.
I0320 14:12:43.409964 543705 cpu.go:282] Add success.
I0320 14:12:43.419757 543705 net.go:648] Add success.
I0320 14:12:43.422578 543705 net.go:770] primary dev: ETH0
I0320 14:12:43.422593 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:12:43.422608 543705 net.go:698] Add success.
I0320 14:12:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:12:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:12:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:12:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:12:53.409798 543705 memory.go:184] no items to output this cycle
I0320 14:12:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 14:13:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:03.409765 543705 memory.go:184] no items to output this cycle
I0320 14:13:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 14:13:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:13.409810 543705 memory.go:191] Add success.
I0320 14:13:13.409813 543705 cpu.go:282] Add success.
W0320 14:13:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:13:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:13:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:13:13.420531 543705 net.go:648] Add success.
I0320 14:13:13.423109 543705 net.go:770] primary dev: ETH0
I0320 14:13:13.423124 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:13:13.423138 543705 net.go:698] Add success.
I0320 14:13:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:13:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:13:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 14:13:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:13:14.456617 543705 disk_worker.go:494] system disk:vda1
I0320 14:13:14.456648 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:13:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:13:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:13:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:13:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:13:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:23.409803 543705 memory.go:184] no items to output this cycle
I0320 14:13:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 14:13:24.647876 543705 disk_info.go:125] begin check local disk info of client
I0320 14:13:24.650497 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:13:24.650504 543705 disk_info.go:196] parse disk info done, disk is : [0xc000216940 0xc000216980]
E0320 14:13:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:33.409789 543705 memory.go:184] no items to output this cycle
I0320 14:13:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 14:13:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:43.409790 543705 memory.go:191] Add success.
I0320 14:13:43.409793 543705 cpu.go:282] Add success.
I0320 14:13:43.419723 543705 net.go:648] Add success.
I0320 14:13:43.422506 543705 net.go:770] primary dev: ETH0
I0320 14:13:43.422519 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:13:43.422530 543705 net.go:698] Add success.
I0320 14:13:46.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:13:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:13:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:13:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:13:53.409777 543705 memory.go:184] no items to output this cycle
I0320 14:13:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 14:14:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:03.409771 543705 memory.go:184] no items to output this cycle
I0320 14:14:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 14:14:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:13.409807 543705 memory.go:191] Add success.
I0320 14:14:13.409810 543705 cpu.go:282] Add success.
W0320 14:14:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:14:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:14:13.409859 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:14:13.420053 543705 net.go:648] Add success.
I0320 14:14:13.423411 543705 net.go:770] primary dev: ETH0
I0320 14:14:13.423423 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:14:13.423436 543705 net.go:698] Add success.
I0320 14:14:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:14:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:14:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 14:14:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:14:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 14:14:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:14:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:14:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:14:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:14:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:14:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:14:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 14:14:23.409786 543705 memory.go:184] no items to output this cycle
I0320 14:14:24.650883 543705 disk_info.go:125] begin check local disk info of client
I0320 14:14:24.653346 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:14:24.653353 543705 disk_info.go:196] parse disk info done, disk is : [0xc000241a40 0xc000241a80]
E0320 14:14:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:33.409781 543705 memory.go:184] no items to output this cycle
I0320 14:14:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 14:14:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:43.409808 543705 memory.go:191] Add success.
I0320 14:14:43.409814 543705 cpu.go:282] Add success.
I0320 14:14:43.420042 543705 net.go:648] Add success.
I0320 14:14:43.423295 543705 net.go:770] primary dev: ETH0
I0320 14:14:43.423308 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:14:43.423320 543705 net.go:698] Add success.
I0320 14:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:14:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:14:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:14:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:14:53.409769 543705 memory.go:184] no items to output this cycle
I0320 14:14:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 14:15:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:03.409781 543705 memory.go:184] no items to output this cycle
I0320 14:15:03.409784 543705 cpu.go:275] no items to output this cycle
E0320 14:15:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:13.409771 543705 memory.go:191] Add success.
W0320 14:15:13.409797 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 14:15:13.409804 543705 cpu.go:282] Add success.
W0320 14:15:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:15:13.409812 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:15:13.420122 543705 net.go:648] Add success.
I0320 14:15:13.423179 543705 net.go:770] primary dev: ETH0
I0320 14:15:13.423192 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:15:13.423204 543705 net.go:698] Add success.
I0320 14:15:13.468322 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9576cbae-fb28-4970-b688-d45577f5d98d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:15:13.468357 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:15:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:15:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:15:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 14:15:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:15:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 14:15:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:15:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:15:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:15:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:15:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:15:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:15:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:23.409773 543705 memory.go:184] no items to output this cycle
I0320 14:15:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 14:15:24.653672 543705 disk_info.go:125] begin check local disk info of client
I0320 14:15:24.656164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:15:24.656170 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bac0 0xc00007bb00]
E0320 14:15:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:33.409814 543705 memory.go:184] no items to output this cycle
I0320 14:15:33.409827 543705 cpu.go:275] no items to output this cycle
I0320 14:15:38.506276 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:15:38.506296 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:15:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:43.410728 543705 memory.go:191] Add success.
I0320 14:15:43.409841 543705 cpu.go:282] Add success.
I0320 14:15:43.420430 543705 net.go:648] Add success.
I0320 14:15:43.423311 543705 net.go:770] primary dev: ETH0
I0320 14:15:43.423323 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:15:43.423337 543705 net.go:698] Add success.
I0320 14:15:46.458013 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:15:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:15:46.458103 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:15:53.410554 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:15:53.410568 543705 cpu.go:275] no items to output this cycle
I0320 14:15:53.410573 543705 memory.go:184] no items to output this cycle
E0320 14:16:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:03.409773 543705 memory.go:184] no items to output this cycle
I0320 14:16:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 14:16:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:13.409793 543705 memory.go:191] Add success.
I0320 14:16:13.409793 543705 cpu.go:282] Add success.
W0320 14:16:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:16:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:16:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:16:13.420180 543705 net.go:648] Add success.
I0320 14:16:13.422943 543705 net.go:770] primary dev: ETH0
I0320 14:16:13.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:16:13.422969 543705 net.go:698] Add success.
I0320 14:16:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:16:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:16:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 14:16:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:16:14.456513 543705 disk_worker.go:494] system disk:vda1
I0320 14:16:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:16:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:16:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:16:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:16:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:16:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:16:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:23.409771 543705 memory.go:184] no items to output this cycle
I0320 14:16:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 14:16:24.656250 543705 disk_info.go:125] begin check local disk info of client
I0320 14:16:24.658814 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:16:24.658820 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331240 0xc000331280]
E0320 14:16:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:33.409794 543705 memory.go:184] no items to output this cycle
I0320 14:16:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 14:16:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:43.409778 543705 memory.go:191] Add success.
I0320 14:16:43.409800 543705 cpu.go:282] Add success.
I0320 14:16:43.420089 543705 net.go:648] Add success.
I0320 14:16:43.422961 543705 net.go:770] primary dev: ETH0
I0320 14:16:43.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:16:43.422987 543705 net.go:698] Add success.
I0320 14:16:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:16:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:16:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:16:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:16:53.409804 543705 memory.go:184] no items to output this cycle
I0320 14:16:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 14:17:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:03.409785 543705 memory.go:184] no items to output this cycle
I0320 14:17:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:17:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:13.409779 543705 cpu.go:282] Add success.
I0320 14:17:13.409788 543705 memory.go:191] Add success.
W0320 14:17:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:17:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:17:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:17:13.420055 543705 net.go:648] Add success.
I0320 14:17:13.422846 543705 net.go:770] primary dev: ETH0
I0320 14:17:13.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:17:13.422870 543705 net.go:698] Add success.
I0320 14:17:13.453392 543705 event_worker.go:152] Polling the log file for events...
W0320 14:17:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:17:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 14:17:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:17:14.455877 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:17:14.455886 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:17:14.455892 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:17:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 14:17:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:17:15.456942 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:17:15.456955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:17:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:17:16.457959 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:17:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:17:16.458036 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:17:16.472506 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:17:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:23.409796 543705 memory.go:184] no items to output this cycle
I0320 14:17:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 14:17:24.659929 543705 disk_info.go:125] begin check local disk info of client
I0320 14:17:24.662488 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:17:24.662493 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5180 0xc0000c51c0]
E0320 14:17:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:33.409790 543705 memory.go:184] no items to output this cycle
I0320 14:17:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:17:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:43.409814 543705 memory.go:191] Add success.
I0320 14:17:43.409824 543705 cpu.go:282] Add success.
I0320 14:17:43.419907 543705 net.go:648] Add success.
I0320 14:17:43.422648 543705 net.go:770] primary dev: ETH0
I0320 14:17:43.422660 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:17:43.422672 543705 net.go:698] Add success.
I0320 14:17:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:17:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:17:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:17:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:17:53.409787 543705 memory.go:184] no items to output this cycle
I0320 14:17:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 14:18:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:03.409763 543705 memory.go:184] no items to output this cycle
I0320 14:18:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 14:18:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:13.409815 543705 memory.go:191] Add success.
I0320 14:18:13.409822 543705 cpu.go:282] Add success.
W0320 14:18:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:18:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:18:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:18:13.420298 543705 net.go:648] Add success.
I0320 14:18:13.423099 543705 net.go:770] primary dev: ETH0
I0320 14:18:13.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:18:13.423124 543705 net.go:698] Add success.
I0320 14:18:13.469502 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"51a6d140-63d8-49d1-abfc-36379f5ed4f8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:18:13.469534 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:18:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:18:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:18:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 14:18:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:18:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 14:18:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:18:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:18:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:18:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:18:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:18:16.472420 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:18:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:23.409783 543705 memory.go:184] no items to output this cycle
I0320 14:18:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 14:18:24.662945 543705 disk_info.go:125] begin check local disk info of client
I0320 14:18:24.665382 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:18:24.665388 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003288c0 0xc000328900]
E0320 14:18:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:33.409814 543705 memory.go:184] no items to output this cycle
I0320 14:18:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 14:18:38.507283 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:18:38.507290 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:18:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:43.410580 543705 memory.go:191] Add success.
I0320 14:18:43.409839 543705 cpu.go:282] Add success.
I0320 14:18:43.420250 543705 net.go:648] Add success.
I0320 14:18:43.422885 543705 net.go:770] primary dev: ETH0
I0320 14:18:43.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:18:43.422910 543705 net.go:698] Add success.
I0320 14:18:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:18:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:18:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:18:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:18:53.409790 543705 memory.go:184] no items to output this cycle
I0320 14:18:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 14:19:03.409860 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:03.409884 543705 memory.go:184] no items to output this cycle
I0320 14:19:03.409949 543705 cpu.go:275] no items to output this cycle
E0320 14:19:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:13.409835 543705 memory.go:191] Add success.
I0320 14:19:13.409839 543705 cpu.go:282] Add success.
W0320 14:19:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:19:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:19:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:19:13.420161 543705 net.go:648] Add success.
I0320 14:19:13.423359 543705 net.go:770] primary dev: ETH0
I0320 14:19:13.423371 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:19:13.423383 543705 net.go:698] Add success.
I0320 14:19:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:19:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:19:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 14:19:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:19:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 14:19:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:19:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:19:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:19:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:19:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:19:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:19:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:23.409779 543705 memory.go:184] no items to output this cycle
I0320 14:19:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 14:19:24.665671 543705 disk_info.go:125] begin check local disk info of client
I0320 14:19:24.668113 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:19:24.668119 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003318c0 0xc000331900]
E0320 14:19:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:33.409807 543705 memory.go:184] no items to output this cycle
I0320 14:19:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:19:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:43.409789 543705 memory.go:191] Add success.
I0320 14:19:43.409817 543705 cpu.go:282] Add success.
I0320 14:19:43.419886 543705 net.go:648] Add success.
I0320 14:19:43.422958 543705 net.go:770] primary dev: ETH0
I0320 14:19:43.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:19:43.422983 543705 net.go:698] Add success.
I0320 14:19:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:19:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:19:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:19:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:19:53.409792 543705 memory.go:184] no items to output this cycle
I0320 14:19:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:20:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:03.409772 543705 memory.go:184] no items to output this cycle
I0320 14:20:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 14:20:13.409870 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:13.409907 543705 memory.go:191] Add success.
I0320 14:20:13.409981 543705 cpu.go:282] Add success.
W0320 14:20:13.410025 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:20:13.410047 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:20:13.410052 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:20:13.419754 543705 net.go:648] Add success.
I0320 14:20:13.422548 543705 net.go:770] primary dev: ETH0
I0320 14:20:13.422562 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:20:13.422577 543705 net.go:698] Add success.
I0320 14:20:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:20:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:20:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 14:20:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:20:14.456471 543705 disk_worker.go:494] system disk:vda1
I0320 14:20:14.456513 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:20:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:20:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:20:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:20:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:20:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:23.409766 543705 memory.go:184] no items to output this cycle
I0320 14:20:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 14:20:24.668984 543705 disk_info.go:125] begin check local disk info of client
I0320 14:20:24.671420 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:20:24.671425 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a22c0 0xc0004a2300]
E0320 14:20:33.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:33.409826 543705 memory.go:184] no items to output this cycle
I0320 14:20:33.409834 543705 cpu.go:275] no items to output this cycle
E0320 14:20:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:43.409825 543705 memory.go:191] Add success.
I0320 14:20:43.409832 543705 cpu.go:282] Add success.
I0320 14:20:43.420002 543705 net.go:648] Add success.
I0320 14:20:43.422614 543705 net.go:770] primary dev: ETH0
I0320 14:20:43.422629 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:20:43.422643 543705 net.go:698] Add success.
I0320 14:20:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:20:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:20:46.458093 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:20:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:20:53.409805 543705 memory.go:184] no items to output this cycle
I0320 14:20:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 14:21:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:03.409767 543705 memory.go:184] no items to output this cycle
I0320 14:21:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 14:21:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:13.409799 543705 memory.go:191] Add success.
I0320 14:21:13.409802 543705 cpu.go:282] Add success.
W0320 14:21:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:21:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:21:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:21:13.420433 543705 net.go:648] Add success.
I0320 14:21:13.423143 543705 net.go:770] primary dev: ETH0
I0320 14:21:13.423157 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:21:13.423169 543705 net.go:698] Add success.
I0320 14:21:13.463180 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"120abda2-aa52-4368-b912-d60574b946d1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:21:13.463215 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:21:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:21:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:21:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 14:21:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:21:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 14:21:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:21:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:21:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:21:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:21:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:21:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:21:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:23.409776 543705 memory.go:184] no items to output this cycle
I0320 14:21:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 14:21:24.671991 543705 disk_info.go:125] begin check local disk info of client
I0320 14:21:24.674491 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:21:24.674497 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faa80 0xc0001faac0]
E0320 14:21:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:33.409782 543705 memory.go:184] no items to output this cycle
I0320 14:21:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 14:21:38.508292 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:21:38.508298 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:21:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:43.410579 543705 memory.go:191] Add success.
I0320 14:21:43.409814 543705 cpu.go:282] Add success.
I0320 14:21:43.420159 543705 net.go:770] primary dev: ETH0
I0320 14:21:43.420175 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:21:43.420191 543705 net.go:698] Add success.
I0320 14:21:43.420565 543705 net.go:648] Add success.
I0320 14:21:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:21:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:21:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:21:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:21:53.409785 543705 memory.go:184] no items to output this cycle
I0320 14:21:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:22:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:03.409770 543705 memory.go:184] no items to output this cycle
I0320 14:22:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 14:22:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:13.409816 543705 memory.go:191] Add success.
I0320 14:22:13.409822 543705 cpu.go:282] Add success.
W0320 14:22:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:22:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:22:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:22:13.420463 543705 net.go:648] Add success.
I0320 14:22:13.423621 543705 net.go:770] primary dev: ETH0
I0320 14:22:13.423634 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:22:13.423645 543705 net.go:698] Add success.
W0320 14:22:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:22:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 14:22:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:22:14.456798 543705 disk_worker.go:494] system disk:vda1
I0320 14:22:14.456837 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:22:14.457158 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:22:14.457165 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:22:14.457170 543705 custom_config.go:64] query custom config with name: gpu
E0320 14:22:15.456790 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:22:15.456798 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:22:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:22:16.457914 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:22:16.457974 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:22:16.457994 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:22:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:22:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:23.409801 543705 memory.go:184] no items to output this cycle
I0320 14:22:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 14:22:24.675011 543705 disk_info.go:125] begin check local disk info of client
I0320 14:22:24.677496 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:22:24.677503 543705 disk_info.go:196] parse disk info done, disk is : [0xc00054a840 0xc00054a880]
E0320 14:22:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:33.409791 543705 memory.go:184] no items to output this cycle
I0320 14:22:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 14:22:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:43.409809 543705 memory.go:191] Add success.
I0320 14:22:43.409819 543705 cpu.go:282] Add success.
I0320 14:22:43.419892 543705 net.go:648] Add success.
I0320 14:22:43.422663 543705 net.go:770] primary dev: ETH0
I0320 14:22:43.422676 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:22:43.422688 543705 net.go:698] Add success.
I0320 14:22:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:22:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:22:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:22:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:22:53.409805 543705 memory.go:184] no items to output this cycle
I0320 14:22:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 14:23:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:03.409776 543705 memory.go:184] no items to output this cycle
I0320 14:23:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:23:13.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:13.409924 543705 memory.go:191] Add success.
I0320 14:23:13.409937 543705 cpu.go:282] Add success.
W0320 14:23:13.409967 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:23:13.409985 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:23:13.409990 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:23:13.419741 543705 net.go:648] Add success.
I0320 14:23:13.422884 543705 net.go:770] primary dev: ETH0
I0320 14:23:13.422899 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:23:13.422913 543705 net.go:698] Add success.
I0320 14:23:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:23:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:23:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 14:23:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:23:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 14:23:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:23:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:23:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:23:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:23:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:23:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:23:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:23.409776 543705 memory.go:184] no items to output this cycle
I0320 14:23:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 14:23:24.677673 543705 disk_info.go:125] begin check local disk info of client
I0320 14:23:24.680125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:23:24.680131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b6280 0xc0003b62c0]
E0320 14:23:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:33.409796 543705 memory.go:184] no items to output this cycle
I0320 14:23:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 14:23:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:43.409787 543705 memory.go:191] Add success.
I0320 14:23:43.409809 543705 cpu.go:282] Add success.
I0320 14:23:43.420022 543705 net.go:648] Add success.
I0320 14:23:43.422864 543705 net.go:770] primary dev: ETH0
I0320 14:23:43.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:23:43.422892 543705 net.go:698] Add success.
I0320 14:23:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:23:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:23:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:23:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:23:53.409790 543705 memory.go:184] no items to output this cycle
I0320 14:23:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 14:24:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:03.409777 543705 memory.go:184] no items to output this cycle
I0320 14:24:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 14:24:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:13.409881 543705 memory.go:191] Add success.
W0320 14:24:13.409912 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:24:13.409925 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:24:13.409928 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:24:13.409940 543705 cpu.go:282] Add success.
I0320 14:24:13.419703 543705 net.go:648] Add success.
I0320 14:24:13.422181 543705 net.go:770] primary dev: ETH0
I0320 14:24:13.422195 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:24:13.422206 543705 net.go:698] Add success.
I0320 14:24:13.463844 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1ae20273-7fcb-44da-8efc-7924bdbf156e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:24:13.463877 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:24:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:24:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:24:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 14:24:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:24:14.456597 543705 disk_worker.go:494] system disk:vda1
I0320 14:24:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:24:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:24:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:24:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:24:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:24:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:23.409803 543705 memory.go:184] no items to output this cycle
I0320 14:24:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 14:24:24.681030 543705 disk_info.go:125] begin check local disk info of client
I0320 14:24:24.683546 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:24:24.683551 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028e580 0xc00028e5c0]
E0320 14:24:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:33.409796 543705 memory.go:184] no items to output this cycle
I0320 14:24:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 14:24:38.509290 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:24:38.509297 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:24:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:43.410579 543705 memory.go:191] Add success.
I0320 14:24:43.409815 543705 cpu.go:282] Add success.
I0320 14:24:43.420344 543705 net.go:648] Add success.
I0320 14:24:43.423153 543705 net.go:770] primary dev: ETH0
I0320 14:24:43.423167 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:24:43.423180 543705 net.go:698] Add success.
I0320 14:24:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:24:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:24:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:24:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:24:53.409800 543705 memory.go:184] no items to output this cycle
I0320 14:24:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 14:25:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:03.409779 543705 memory.go:184] no items to output this cycle
I0320 14:25:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 14:25:13.409876 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:13.409908 543705 memory.go:191] Add success.
W0320 14:25:13.409963 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 14:25:13.409981 543705 cpu.go:282] Add success.
W0320 14:25:13.409984 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:25:13.409989 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:25:13.419730 543705 net.go:648] Add success.
I0320 14:25:13.422498 543705 net.go:770] primary dev: ETH0
I0320 14:25:13.422511 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:25:13.422522 543705 net.go:698] Add success.
I0320 14:25:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:25:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:25:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 14:25:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:25:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 14:25:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:25:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:25:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:25:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:25:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:25:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:25:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:23.409777 543705 memory.go:184] no items to output this cycle
I0320 14:25:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 14:25:24.683631 543705 disk_info.go:125] begin check local disk info of client
I0320 14:25:24.686126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:25:24.686132 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331880 0xc0003318c0]
E0320 14:25:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:33.409784 543705 memory.go:184] no items to output this cycle
I0320 14:25:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 14:25:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:43.409792 543705 memory.go:191] Add success.
I0320 14:25:43.409813 543705 cpu.go:282] Add success.
I0320 14:25:43.419903 543705 net.go:648] Add success.
I0320 14:25:43.422591 543705 net.go:770] primary dev: ETH0
I0320 14:25:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:25:43.422620 543705 net.go:698] Add success.
I0320 14:25:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:25:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:25:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:25:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:25:53.409805 543705 memory.go:184] no items to output this cycle
I0320 14:25:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:26:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:03.409779 543705 memory.go:184] no items to output this cycle
I0320 14:26:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 14:26:13.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:13.409933 543705 memory.go:191] Add success.
I0320 14:26:13.409937 543705 cpu.go:282] Add success.
W0320 14:26:13.409976 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:26:13.409995 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:26:13.409999 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:26:13.419727 543705 net.go:648] Add success.
I0320 14:26:13.422668 543705 net.go:770] primary dev: ETH0
I0320 14:26:13.422683 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:26:13.422697 543705 net.go:698] Add success.
I0320 14:26:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:26:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:26:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 14:26:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:26:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 14:26:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:26:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:26:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:26:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:26:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:26:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:23.409803 543705 memory.go:184] no items to output this cycle
I0320 14:26:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 14:26:24.686214 543705 disk_info.go:125] begin check local disk info of client
I0320 14:26:24.688793 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:26:24.688799 543705 disk_info.go:196] parse disk info done, disk is : [0xc000513980 0xc000513a00]
E0320 14:26:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 14:26:33.409800 543705 memory.go:184] no items to output this cycle
E0320 14:26:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:43.409808 543705 memory.go:191] Add success.
I0320 14:26:43.409809 543705 cpu.go:282] Add success.
I0320 14:26:43.419878 543705 net.go:648] Add success.
I0320 14:26:43.423298 543705 net.go:770] primary dev: ETH0
I0320 14:26:43.423314 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:26:43.423328 543705 net.go:698] Add success.
I0320 14:26:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:26:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:26:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:26:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:26:53.409780 543705 memory.go:184] no items to output this cycle
I0320 14:26:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 14:27:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:03.409775 543705 memory.go:184] no items to output this cycle
I0320 14:27:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 14:27:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:13.409802 543705 cpu.go:282] Add success.
I0320 14:27:13.409806 543705 memory.go:191] Add success.
W0320 14:27:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:27:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:27:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:27:13.420116 543705 net.go:648] Add success.
I0320 14:27:13.422963 543705 net.go:770] primary dev: ETH0
I0320 14:27:13.422980 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:27:13.422994 543705 net.go:698] Add success.
I0320 14:27:13.429348 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 14:27:13.453528 543705 event_worker.go:152] Polling the log file for events...
I0320 14:27:13.469775 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"84fe879e-e9d4-4eb4-89a1-f7f8c32a9bc9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:27:13.469810 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 14:27:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:27:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 14:27:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:27:14.456793 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:27:14.456802 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:27:14.456807 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:27:14.456849 543705 disk_worker.go:494] system disk:vda1
I0320 14:27:14.456876 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:27:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:27:15.456821 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:27:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:27:16.457939 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:27:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:27:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:27:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:27:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:23.409768 543705 memory.go:184] no items to output this cycle
I0320 14:27:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 14:27:24.689672 543705 disk_info.go:125] begin check local disk info of client
I0320 14:27:24.692095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:27:24.692100 543705 disk_info.go:196] parse disk info done, disk is : [0xc000513b40 0xc000513b80]
E0320 14:27:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:33.409813 543705 memory.go:184] no items to output this cycle
I0320 14:27:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 14:27:38.510317 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:27:38.510324 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:27:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:43.410711 543705 memory.go:191] Add success.
I0320 14:27:43.409815 543705 cpu.go:282] Add success.
I0320 14:27:43.420483 543705 net.go:648] Add success.
I0320 14:27:43.423110 543705 net.go:770] primary dev: ETH0
I0320 14:27:43.423123 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:27:43.423134 543705 net.go:698] Add success.
I0320 14:27:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:27:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:27:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:27:53.410397 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:27:53.410415 543705 memory.go:184] no items to output this cycle
I0320 14:27:53.410427 543705 cpu.go:275] no items to output this cycle
E0320 14:28:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:03.409779 543705 memory.go:184] no items to output this cycle
I0320 14:28:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 14:28:13.409971 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:13.410058 543705 memory.go:191] Add success.
I0320 14:28:13.410085 543705 cpu.go:282] Add success.
W0320 14:28:13.410093 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:28:13.410115 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:28:13.410119 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:28:13.419750 543705 net.go:648] Add success.
I0320 14:28:13.422494 543705 net.go:770] primary dev: ETH0
I0320 14:28:13.422509 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:28:13.422522 543705 net.go:698] Add success.
I0320 14:28:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:28:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:28:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 14:28:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:28:14.456562 543705 disk_worker.go:494] system disk:vda1
I0320 14:28:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:28:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:28:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:28:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:28:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:28:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:28:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:23.409798 543705 memory.go:184] no items to output this cycle
I0320 14:28:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 14:28:24.693098 543705 disk_info.go:125] begin check local disk info of client
I0320 14:28:24.695542 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:28:24.695547 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5ec0 0xc0000c5f00]
E0320 14:28:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:33.409783 543705 memory.go:184] no items to output this cycle
I0320 14:28:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 14:28:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:43.409794 543705 memory.go:191] Add success.
I0320 14:28:43.409805 543705 cpu.go:282] Add success.
I0320 14:28:43.419859 543705 net.go:648] Add success.
I0320 14:28:43.423217 543705 net.go:770] primary dev: ETH0
I0320 14:28:43.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:28:43.423242 543705 net.go:698] Add success.
I0320 14:28:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:28:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:28:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:28:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:28:53.409774 543705 memory.go:184] no items to output this cycle
I0320 14:28:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 14:29:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:03.409770 543705 memory.go:184] no items to output this cycle
I0320 14:29:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:29:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:13.409817 543705 memory.go:191] Add success.
I0320 14:29:13.409819 543705 cpu.go:282] Add success.
W0320 14:29:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:29:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:29:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:29:13.419744 543705 net.go:648] Add success.
I0320 14:29:13.422746 543705 net.go:770] primary dev: ETH0
I0320 14:29:13.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:29:13.422770 543705 net.go:698] Add success.
I0320 14:29:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:29:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:29:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 14:29:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:29:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 14:29:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:29:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:29:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:29:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:29:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:29:16.472354 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:29:23.410252 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:23.410266 543705 memory.go:184] no items to output this cycle
I0320 14:29:23.410267 543705 cpu.go:275] no items to output this cycle
I0320 14:29:24.696121 543705 disk_info.go:125] begin check local disk info of client
I0320 14:29:24.698578 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:29:24.698583 543705 disk_info.go:196] parse disk info done, disk is : [0xc000381980 0xc0003819c0]
E0320 14:29:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:33.409780 543705 memory.go:184] no items to output this cycle
I0320 14:29:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 14:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:43.409806 543705 memory.go:191] Add success.
I0320 14:29:43.409813 543705 cpu.go:282] Add success.
I0320 14:29:43.419939 543705 net.go:648] Add success.
I0320 14:29:43.422636 543705 net.go:770] primary dev: ETH0
I0320 14:29:43.422648 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:29:43.422659 543705 net.go:698] Add success.
I0320 14:29:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:29:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:29:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:29:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:29:53.409805 543705 memory.go:184] no items to output this cycle
I0320 14:29:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:30:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:03.409805 543705 memory.go:184] no items to output this cycle
I0320 14:30:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:30:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:13.409909 543705 memory.go:191] Add success.
I0320 14:30:13.409937 543705 cpu.go:282] Add success.
W0320 14:30:13.409945 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:30:13.409959 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:30:13.409962 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:30:13.419741 543705 net.go:648] Add success.
I0320 14:30:13.422677 543705 net.go:770] primary dev: ETH0
I0320 14:30:13.422692 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:30:13.422705 543705 net.go:698] Add success.
I0320 14:30:13.468003 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3d2ca136-df07-4192-befa-34ecf4702053","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:30:13.468042 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:30:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:30:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:30:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 14:30:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:30:14.456739 543705 disk_worker.go:494] system disk:vda1
I0320 14:30:14.456768 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:30:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:30:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:30:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:30:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:30:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:30:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:23.409775 543705 memory.go:184] no items to output this cycle
I0320 14:30:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 14:30:24.698663 543705 disk_info.go:125] begin check local disk info of client
I0320 14:30:24.701179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:30:24.701186 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa9c0 0xc0001fb040]
E0320 14:30:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:33.409786 543705 memory.go:184] no items to output this cycle
I0320 14:30:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 14:30:38.511300 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:30:38.511307 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:30:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:43.410620 543705 memory.go:191] Add success.
I0320 14:30:43.409802 543705 cpu.go:282] Add success.
I0320 14:30:43.420316 543705 net.go:648] Add success.
I0320 14:30:43.423396 543705 net.go:770] primary dev: ETH0
I0320 14:30:43.423411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:30:43.423422 543705 net.go:698] Add success.
I0320 14:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:30:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:30:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:30:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:30:53.409779 543705 memory.go:184] no items to output this cycle
I0320 14:30:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:31:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:03.409772 543705 memory.go:184] no items to output this cycle
I0320 14:31:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 14:31:13.409890 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:13.409919 543705 memory.go:191] Add success.
I0320 14:31:13.409924 543705 cpu.go:282] Add success.
W0320 14:31:13.409951 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:31:13.409966 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:31:13.409972 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:31:13.419724 543705 net.go:648] Add success.
I0320 14:31:13.422186 543705 net.go:770] primary dev: ETH0
I0320 14:31:13.422200 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:31:13.422212 543705 net.go:698] Add success.
I0320 14:31:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:31:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:31:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 14:31:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:31:14.456545 543705 disk_worker.go:494] system disk:vda1
I0320 14:31:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:31:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:31:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:31:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:31:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:31:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:31:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:23.409763 543705 memory.go:184] no items to output this cycle
I0320 14:31:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 14:31:24.701668 543705 disk_info.go:125] begin check local disk info of client
I0320 14:31:24.704126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:31:24.704132 543705 disk_info.go:196] parse disk info done, disk is : [0xc000468980 0xc0004689c0]
E0320 14:31:33.409815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:33.409820 543705 cpu.go:275] no items to output this cycle
I0320 14:31:33.409836 543705 memory.go:184] no items to output this cycle
E0320 14:31:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:43.409801 543705 memory.go:191] Add success.
I0320 14:31:43.409824 543705 cpu.go:282] Add success.
I0320 14:31:43.419702 543705 net.go:770] primary dev: ETH0
I0320 14:31:43.419718 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:31:43.419732 543705 net.go:698] Add success.
I0320 14:31:43.420077 543705 net.go:648] Add success.
I0320 14:31:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:31:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:31:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:31:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:31:53.409816 543705 memory.go:184] no items to output this cycle
I0320 14:31:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 14:32:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:03.409789 543705 memory.go:184] no items to output this cycle
I0320 14:32:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 14:32:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:13.409792 543705 memory.go:191] Add success.
I0320 14:32:13.409800 543705 cpu.go:282] Add success.
W0320 14:32:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:32:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:32:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:32:13.420241 543705 net.go:648] Add success.
I0320 14:32:13.423028 543705 net.go:770] primary dev: ETH0
I0320 14:32:13.423041 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:32:13.423052 543705 net.go:698] Add success.
W0320 14:32:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:32:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 14:32:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:32:14.456781 543705 disk_worker.go:494] system disk:vda1
I0320 14:32:14.456819 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:32:14.457119 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:32:14.457127 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:32:14.457131 543705 custom_config.go:64] query custom config with name: gpu
E0320 14:32:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:32:15.456809 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:32:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:32:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:32:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:32:16.458020 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:32:16.472333 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:32:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:23.409800 543705 memory.go:184] no items to output this cycle
I0320 14:32:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 14:32:24.704212 543705 disk_info.go:125] begin check local disk info of client
I0320 14:32:24.706692 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:32:24.706698 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004af6c0 0xc0004af700]
E0320 14:32:33.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:33.409813 543705 memory.go:184] no items to output this cycle
I0320 14:32:33.409857 543705 cpu.go:275] no items to output this cycle
E0320 14:32:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:43.409801 543705 memory.go:191] Add success.
I0320 14:32:43.409802 543705 cpu.go:282] Add success.
I0320 14:32:43.419879 543705 net.go:648] Add success.
I0320 14:32:43.422468 543705 net.go:770] primary dev: ETH0
I0320 14:32:43.422482 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:32:43.422498 543705 net.go:698] Add success.
I0320 14:32:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:32:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:32:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:32:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:32:53.409782 543705 memory.go:184] no items to output this cycle
I0320 14:32:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 14:33:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:03.409819 543705 memory.go:184] no items to output this cycle
I0320 14:33:03.409833 543705 cpu.go:275] no items to output this cycle
E0320 14:33:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:13.409789 543705 memory.go:191] Add success.
I0320 14:33:13.409808 543705 cpu.go:282] Add success.
W0320 14:33:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:33:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:33:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:33:13.420357 543705 net.go:648] Add success.
I0320 14:33:13.422973 543705 net.go:770] primary dev: ETH0
I0320 14:33:13.422988 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:33:13.423002 543705 net.go:698] Add success.
I0320 14:33:13.463348 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"00291b22-8ee3-48ef-8bcc-5ab25a007485","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:33:13.463378 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:33:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:33:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:33:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 14:33:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:33:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 14:33:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:33:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:33:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:33:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:33:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:33:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:33:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:23.409767 543705 memory.go:184] no items to output this cycle
I0320 14:33:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 14:33:24.708176 543705 disk_info.go:125] begin check local disk info of client
I0320 14:33:24.710642 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:33:24.710648 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abb00 0xc0001abb40]
I0320 14:33:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:33:33.409826 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:33.409845 543705 memory.go:184] no items to output this cycle
I0320 14:33:38.512326 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:33:38.512333 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:33:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:43.410831 543705 memory.go:191] Add success.
I0320 14:33:43.409849 543705 cpu.go:282] Add success.
I0320 14:33:43.420569 543705 net.go:648] Add success.
I0320 14:33:43.423454 543705 net.go:770] primary dev: ETH0
I0320 14:33:43.423468 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:33:43.423480 543705 net.go:698] Add success.
I0320 14:33:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:33:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:33:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:33:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:33:53.409817 543705 memory.go:184] no items to output this cycle
I0320 14:33:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 14:34:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:03.409786 543705 memory.go:184] no items to output this cycle
I0320 14:34:03.409791 543705 cpu.go:275] no items to output this cycle
W0320 14:34:13.409712 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:34:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:34:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:34:13.409800 543705 cpu.go:282] Add success.
E0320 14:34:13.409852 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:13.409874 543705 memory.go:191] Add success.
I0320 14:34:13.420338 543705 net.go:648] Add success.
I0320 14:34:13.423019 543705 net.go:770] primary dev: ETH0
I0320 14:34:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:34:13.423042 543705 net.go:698] Add success.
I0320 14:34:14.454942 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:34:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:34:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 14:34:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:34:14.456543 543705 disk_worker.go:494] system disk:vda1
I0320 14:34:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:34:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:34:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:34:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:34:16.472482 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:34:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:23.409765 543705 memory.go:184] no items to output this cycle
I0320 14:34:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 14:34:24.711192 543705 disk_info.go:125] begin check local disk info of client
I0320 14:34:24.713637 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:34:24.713658 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbec0 0xc0001fbf00]
E0320 14:34:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:33.409779 543705 memory.go:184] no items to output this cycle
I0320 14:34:33.409841 543705 cpu.go:275] no items to output this cycle
E0320 14:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:43.409795 543705 memory.go:191] Add success.
I0320 14:34:43.409822 543705 cpu.go:282] Add success.
I0320 14:34:43.420075 543705 net.go:648] Add success.
I0320 14:34:43.423132 543705 net.go:770] primary dev: ETH0
I0320 14:34:43.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:34:43.423162 543705 net.go:698] Add success.
I0320 14:34:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:34:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:34:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:34:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:34:53.409808 543705 memory.go:184] no items to output this cycle
I0320 14:34:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 14:35:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:03.409800 543705 memory.go:184] no items to output this cycle
I0320 14:35:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 14:35:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:13.409807 543705 memory.go:191] Add success.
I0320 14:35:13.409808 543705 cpu.go:282] Add success.
W0320 14:35:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:35:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:35:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:35:13.420126 543705 net.go:648] Add success.
I0320 14:35:13.422830 543705 net.go:770] primary dev: ETH0
I0320 14:35:13.422842 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:35:13.422853 543705 net.go:698] Add success.
I0320 14:35:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:35:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:35:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 14:35:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:35:14.459088 543705 disk_worker.go:494] system disk:vda1
I0320 14:35:14.459115 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:35:15.455919 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:35:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:35:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:35:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:35:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:23.409803 543705 memory.go:184] no items to output this cycle
I0320 14:35:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 14:35:24.714726 543705 disk_info.go:125] begin check local disk info of client
I0320 14:35:24.717159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:35:24.717164 543705 disk_info.go:196] parse disk info done, disk is : [0xc000234540 0xc000234580]
E0320 14:35:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:33.409805 543705 memory.go:184] no items to output this cycle
I0320 14:35:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:35:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:43.409831 543705 memory.go:191] Add success.
I0320 14:35:43.409836 543705 cpu.go:282] Add success.
I0320 14:35:43.420004 543705 net.go:648] Add success.
I0320 14:35:43.422759 543705 net.go:770] primary dev: ETH0
I0320 14:35:43.422772 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:35:43.422785 543705 net.go:698] Add success.
I0320 14:35:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:35:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:35:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:35:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:35:53.409807 543705 memory.go:184] no items to output this cycle
I0320 14:35:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:36:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:03.409793 543705 memory.go:184] no items to output this cycle
I0320 14:36:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 14:36:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:13.409792 543705 memory.go:191] Add success.
I0320 14:36:13.409791 543705 cpu.go:282] Add success.
W0320 14:36:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:36:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:36:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:36:13.420114 543705 net.go:648] Add success.
I0320 14:36:13.423220 543705 net.go:770] primary dev: ETH0
I0320 14:36:13.423239 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:36:13.423268 543705 net.go:698] Add success.
I0320 14:36:13.935066 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a4fc6d2e-271f-4299-afe9-cc38fe4048a6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:36:13.935099 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:36:14.454731 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:36:14.454990 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:36:14.455091 543705 disk_worker.go:708] disk space is not compliant
W0320 14:36:14.455096 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:36:14.456783 543705 disk_worker.go:494] system disk:vda1
I0320 14:36:14.456811 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:36:15.455605 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:36:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:36:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:36:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:36:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:36:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:23.409778 543705 memory.go:184] no items to output this cycle
I0320 14:36:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 14:36:24.717669 543705 disk_info.go:125] begin check local disk info of client
I0320 14:36:24.720101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:36:24.720106 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d9440 0xc0003d9480]
E0320 14:36:33.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:33.409794 543705 cpu.go:275] no items to output this cycle
I0320 14:36:33.409811 543705 memory.go:184] no items to output this cycle
I0320 14:36:38.513331 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:36:38.513339 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:36:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:43.410638 543705 memory.go:191] Add success.
I0320 14:36:43.409831 543705 cpu.go:282] Add success.
I0320 14:36:43.420386 543705 net.go:648] Add success.
I0320 14:36:43.423328 543705 net.go:770] primary dev: ETH0
I0320 14:36:43.423340 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:36:43.423353 543705 net.go:698] Add success.
I0320 14:36:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:36:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:36:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:36:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:36:53.409773 543705 memory.go:184] no items to output this cycle
I0320 14:36:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 14:37:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:03.409809 543705 memory.go:184] no items to output this cycle
I0320 14:37:03.409824 543705 cpu.go:275] no items to output this cycle
W0320 14:37:13.409699 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:37:13.409714 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:37:13.409718 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 14:37:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:13.409811 543705 memory.go:191] Add success.
I0320 14:37:13.409816 543705 cpu.go:282] Add success.
I0320 14:37:13.420052 543705 net.go:648] Add success.
I0320 14:37:13.422896 543705 net.go:770] primary dev: ETH0
I0320 14:37:13.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:37:13.422921 543705 net.go:698] Add success.
I0320 14:37:13.453443 543705 event_worker.go:152] Polling the log file for events...
W0320 14:37:14.455322 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:37:14.455457 543705 disk_worker.go:708] disk space is not compliant
W0320 14:37:14.455462 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:37:14.456126 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:37:14.456134 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:37:14.456138 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:37:14.457249 543705 disk_worker.go:494] system disk:vda1
I0320 14:37:14.457290 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:37:15.456867 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:37:15.456877 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:37:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:37:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:37:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:37:16.458014 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:37:16.472330 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:37:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:23.409773 543705 cpu.go:275] no items to output this cycle
I0320 14:37:23.409784 543705 memory.go:184] no items to output this cycle
I0320 14:37:24.721247 543705 disk_info.go:125] begin check local disk info of client
I0320 14:37:24.723711 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:37:24.723717 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004700c0 0xc000470100]
E0320 14:37:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:33.409790 543705 memory.go:184] no items to output this cycle
I0320 14:37:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 14:37:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:43.409789 543705 memory.go:191] Add success.
I0320 14:37:43.409811 543705 cpu.go:282] Add success.
I0320 14:37:43.420038 543705 net.go:648] Add success.
I0320 14:37:43.422939 543705 net.go:770] primary dev: ETH0
I0320 14:37:43.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:37:43.422967 543705 net.go:698] Add success.
I0320 14:37:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:37:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:37:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:37:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:37:53.409784 543705 memory.go:184] no items to output this cycle
I0320 14:37:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 14:38:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:03.409772 543705 memory.go:184] no items to output this cycle
I0320 14:38:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 14:38:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:13.409811 543705 memory.go:191] Add success.
I0320 14:38:13.409816 543705 cpu.go:282] Add success.
W0320 14:38:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:38:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:38:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:38:13.420072 543705 net.go:648] Add success.
I0320 14:38:13.423100 543705 net.go:770] primary dev: ETH0
I0320 14:38:13.423113 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:38:13.423126 543705 net.go:698] Add success.
I0320 14:38:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:38:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:38:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 14:38:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:38:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 14:38:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:38:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:38:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:38:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:38:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:38:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:38:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:23.409770 543705 memory.go:184] no items to output this cycle
I0320 14:38:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 14:38:24.725260 543705 disk_info.go:125] begin check local disk info of client
I0320 14:38:24.727782 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:38:24.727787 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049e8c0 0xc00049e900]
E0320 14:38:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:33.409798 543705 memory.go:184] no items to output this cycle
I0320 14:38:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 14:38:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:43.409800 543705 memory.go:191] Add success.
I0320 14:38:43.409800 543705 cpu.go:282] Add success.
I0320 14:38:43.419977 543705 net.go:648] Add success.
I0320 14:38:43.422702 543705 net.go:770] primary dev: ETH0
I0320 14:38:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:38:43.422730 543705 net.go:698] Add success.
I0320 14:38:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:38:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:38:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:38:53.410243 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:38:53.410269 543705 memory.go:184] no items to output this cycle
I0320 14:38:53.410279 543705 cpu.go:275] no items to output this cycle
E0320 14:39:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:03.409786 543705 memory.go:184] no items to output this cycle
I0320 14:39:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:39:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:13.409810 543705 memory.go:191] Add success.
I0320 14:39:13.409818 543705 cpu.go:282] Add success.
W0320 14:39:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:39:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:39:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:39:13.420097 543705 net.go:648] Add success.
I0320 14:39:13.422896 543705 net.go:770] primary dev: ETH0
I0320 14:39:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:39:13.422927 543705 net.go:698] Add success.
I0320 14:39:13.469717 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"eb53bf53-338e-4e28-befa-97b866615c09","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:39:13.469751 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:39:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:39:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:39:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 14:39:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:39:14.456899 543705 disk_worker.go:494] system disk:vda1
I0320 14:39:14.456928 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:39:15.455613 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:39:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:39:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:39:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:39:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:39:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:23.409781 543705 memory.go:184] no items to output this cycle
I0320 14:39:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 14:39:24.729280 543705 disk_info.go:125] begin check local disk info of client
I0320 14:39:24.731718 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:39:24.731724 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4100 0xc0000c4140]
E0320 14:39:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 14:39:33.409784 543705 memory.go:184] no items to output this cycle
I0320 14:39:38.514329 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:39:38.514336 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:39:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:43.410920 543705 memory.go:191] Add success.
I0320 14:39:43.409803 543705 cpu.go:282] Add success.
I0320 14:39:43.420651 543705 net.go:648] Add success.
I0320 14:39:43.423575 543705 net.go:770] primary dev: ETH0
I0320 14:39:43.423588 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:39:43.423600 543705 net.go:698] Add success.
I0320 14:39:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:39:46.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:39:46.458050 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:39:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:39:53.409784 543705 memory.go:184] no items to output this cycle
I0320 14:39:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 14:40:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:03.409771 543705 memory.go:184] no items to output this cycle
I0320 14:40:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:40:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:13.409807 543705 memory.go:191] Add success.
I0320 14:40:13.409827 543705 cpu.go:282] Add success.
W0320 14:40:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:40:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:40:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:40:13.420195 543705 net.go:648] Add success.
I0320 14:40:13.422998 543705 net.go:770] primary dev: ETH0
I0320 14:40:13.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:40:13.423033 543705 net.go:698] Add success.
I0320 14:40:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:40:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:40:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 14:40:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:40:14.456564 543705 disk_worker.go:494] system disk:vda1
I0320 14:40:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:40:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:40:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:40:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:40:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:40:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:40:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:23.409763 543705 memory.go:184] no items to output this cycle
I0320 14:40:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 14:40:24.733301 543705 disk_info.go:125] begin check local disk info of client
I0320 14:40:24.735745 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:40:24.735751 543705 disk_info.go:196] parse disk info done, disk is : [0xc000483000 0xc000483040]
E0320 14:40:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:33.409764 543705 memory.go:184] no items to output this cycle
I0320 14:40:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 14:40:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:43.409828 543705 memory.go:191] Add success.
I0320 14:40:43.409835 543705 cpu.go:282] Add success.
I0320 14:40:43.419912 543705 net.go:648] Add success.
I0320 14:40:43.422917 543705 net.go:770] primary dev: ETH0
I0320 14:40:43.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:40:43.422946 543705 net.go:698] Add success.
I0320 14:40:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:40:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:40:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:40:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:40:53.409774 543705 memory.go:184] no items to output this cycle
I0320 14:40:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 14:41:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:03.409795 543705 memory.go:184] no items to output this cycle
I0320 14:41:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 14:41:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:13.409796 543705 memory.go:191] Add success.
I0320 14:41:13.409801 543705 cpu.go:282] Add success.
W0320 14:41:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:41:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:41:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:41:13.420241 543705 net.go:648] Add success.
I0320 14:41:13.422942 543705 net.go:770] primary dev: ETH0
I0320 14:41:13.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:41:13.422969 543705 net.go:698] Add success.
I0320 14:41:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:41:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:41:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 14:41:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:41:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 14:41:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:41:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:41:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:41:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:41:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:41:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:41:23.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:23.409895 543705 memory.go:184] no items to output this cycle
I0320 14:41:23.409954 543705 cpu.go:275] no items to output this cycle
I0320 14:41:24.735834 543705 disk_info.go:125] begin check local disk info of client
I0320 14:41:24.738326 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:41:24.738332 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390000 0xc000390040]
E0320 14:41:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:33.409794 543705 memory.go:184] no items to output this cycle
I0320 14:41:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 14:41:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:43.409792 543705 memory.go:191] Add success.
I0320 14:41:43.409809 543705 cpu.go:282] Add success.
I0320 14:41:43.419902 543705 net.go:648] Add success.
I0320 14:41:43.422715 543705 net.go:770] primary dev: ETH0
I0320 14:41:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:41:43.422744 543705 net.go:698] Add success.
I0320 14:41:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:41:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:41:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:41:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:41:53.409774 543705 memory.go:184] no items to output this cycle
I0320 14:41:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 14:42:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:03.409775 543705 memory.go:184] no items to output this cycle
I0320 14:42:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 14:42:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:13.409805 543705 memory.go:191] Add success.
I0320 14:42:13.409813 543705 cpu.go:282] Add success.
W0320 14:42:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:42:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:42:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:42:13.420108 543705 net.go:648] Add success.
I0320 14:42:13.422913 543705 net.go:770] primary dev: ETH0
I0320 14:42:13.422928 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:42:13.422943 543705 net.go:698] Add success.
I0320 14:42:13.470176 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"81eb2364-6bc0-4456-afeb-54526256c564","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:42:13.470209 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 14:42:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:42:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 14:42:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:42:14.455869 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:42:14.455876 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:42:14.455880 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:42:14.456836 543705 disk_worker.go:494] system disk:vda1
I0320 14:42:14.456867 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:42:15.456878 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:42:15.456887 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:42:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:42:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:42:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:42:16.458015 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:42:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:42:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:23.409784 543705 memory.go:184] no items to output this cycle
I0320 14:42:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 14:42:24.739340 543705 disk_info.go:125] begin check local disk info of client
I0320 14:42:24.741773 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:42:24.741778 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa080 0xc0001aa0c0]
E0320 14:42:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:33.409782 543705 memory.go:184] no items to output this cycle
I0320 14:42:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 14:42:38.515344 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:42:38.515352 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:42:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:43.410818 543705 memory.go:191] Add success.
I0320 14:42:43.409836 543705 cpu.go:282] Add success.
I0320 14:42:43.420505 543705 net.go:648] Add success.
I0320 14:42:43.423472 543705 net.go:770] primary dev: ETH0
I0320 14:42:43.423485 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:42:43.423498 543705 net.go:698] Add success.
I0320 14:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:42:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:42:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:42:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:42:53.409797 543705 memory.go:184] no items to output this cycle
I0320 14:42:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 14:43:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:03.409782 543705 memory.go:184] no items to output this cycle
I0320 14:43:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:43:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:13.409797 543705 memory.go:191] Add success.
I0320 14:43:13.409798 543705 cpu.go:282] Add success.
W0320 14:43:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:43:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:43:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:43:13.420069 543705 net.go:648] Add success.
I0320 14:43:13.422718 543705 net.go:770] primary dev: ETH0
I0320 14:43:13.422733 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:43:13.422745 543705 net.go:698] Add success.
I0320 14:43:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:43:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:43:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 14:43:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:43:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 14:43:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:43:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:43:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:43:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:43:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:43:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:43:23.410331 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:23.410345 543705 memory.go:184] no items to output this cycle
I0320 14:43:23.410345 543705 cpu.go:275] no items to output this cycle
I0320 14:43:24.743340 543705 disk_info.go:125] begin check local disk info of client
I0320 14:43:24.745779 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:43:24.745785 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d5840 0xc0003d5880]
E0320 14:43:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:33.409788 543705 memory.go:184] no items to output this cycle
I0320 14:43:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 14:43:43.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:43.409830 543705 memory.go:191] Add success.
I0320 14:43:43.409831 543705 cpu.go:282] Add success.
I0320 14:43:43.419914 543705 net.go:648] Add success.
I0320 14:43:43.422756 543705 net.go:770] primary dev: ETH0
I0320 14:43:43.422769 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:43:43.422782 543705 net.go:698] Add success.
I0320 14:43:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:43:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:43:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:43:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:43:53.409791 543705 memory.go:184] no items to output this cycle
I0320 14:43:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 14:44:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:03.409802 543705 memory.go:184] no items to output this cycle
I0320 14:44:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 14:44:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:13.409794 543705 memory.go:191] Add success.
I0320 14:44:13.409812 543705 cpu.go:282] Add success.
W0320 14:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:44:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:44:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:44:13.420061 543705 net.go:648] Add success.
I0320 14:44:13.423454 543705 net.go:770] primary dev: ETH0
I0320 14:44:13.423469 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:44:13.423484 543705 net.go:698] Add success.
I0320 14:44:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:44:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:44:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 14:44:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:44:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 14:44:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:44:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:44:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:44:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:44:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:44:16.472427 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:44:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:23.409769 543705 memory.go:184] no items to output this cycle
I0320 14:44:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 14:44:24.747372 543705 disk_info.go:125] begin check local disk info of client
I0320 14:44:24.749839 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:44:24.749846 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae000 0xc0004ae040]
E0320 14:44:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:33.409772 543705 memory.go:184] no items to output this cycle
I0320 14:44:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 14:44:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:43.409807 543705 memory.go:191] Add success.
I0320 14:44:43.409813 543705 cpu.go:282] Add success.
I0320 14:44:43.419931 543705 net.go:648] Add success.
I0320 14:44:43.422912 543705 net.go:770] primary dev: ETH0
I0320 14:44:43.422925 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:44:43.422937 543705 net.go:698] Add success.
I0320 14:44:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:44:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:44:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:44:53.409787 543705 cpu.go:275] no items to output this cycle
I0320 14:44:53.409789 543705 memory.go:184] no items to output this cycle
E0320 14:45:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:03.409780 543705 cpu.go:275] no items to output this cycle
I0320 14:45:03.409783 543705 memory.go:184] no items to output this cycle
E0320 14:45:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:13.409782 543705 memory.go:191] Add success.
I0320 14:45:13.409785 543705 cpu.go:282] Add success.
W0320 14:45:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:45:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:45:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:45:13.420185 543705 net.go:648] Add success.
I0320 14:45:13.422730 543705 net.go:770] primary dev: ETH0
I0320 14:45:13.422742 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:45:13.422753 543705 net.go:698] Add success.
I0320 14:45:13.687295 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1d2d2433-f534-4421-b87b-641f45791942","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:45:13.687333 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:45:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:45:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:45:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 14:45:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:45:14.456860 543705 disk_worker.go:494] system disk:vda1
I0320 14:45:14.456889 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:45:16.457564 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:45:16.457625 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:45:16.457716 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:45:16.472971 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:45:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:23.409778 543705 memory.go:184] no items to output this cycle
I0320 14:45:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 14:45:24.751391 543705 disk_info.go:125] begin check local disk info of client
I0320 14:45:24.753822 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:45:24.753828 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057a400 0xc00057a440]
E0320 14:45:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:33.409772 543705 memory.go:184] no items to output this cycle
I0320 14:45:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 14:45:38.516335 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:45:38.516344 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:45:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:43.410786 543705 memory.go:191] Add success.
I0320 14:45:43.409826 543705 cpu.go:282] Add success.
I0320 14:45:43.420533 543705 net.go:648] Add success.
I0320 14:45:43.423171 543705 net.go:770] primary dev: ETH0
I0320 14:45:43.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:45:43.423201 543705 net.go:698] Add success.
I0320 14:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:45:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:45:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:45:53.410250 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:45:53.410267 543705 memory.go:184] no items to output this cycle
I0320 14:45:53.410274 543705 cpu.go:275] no items to output this cycle
E0320 14:46:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:03.409801 543705 memory.go:184] no items to output this cycle
I0320 14:46:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 14:46:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:13.409787 543705 cpu.go:282] Add success.
I0320 14:46:13.409792 543705 memory.go:191] Add success.
W0320 14:46:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:46:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:46:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:46:13.420109 543705 net.go:648] Add success.
I0320 14:46:13.422732 543705 net.go:770] primary dev: ETH0
I0320 14:46:13.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:46:13.422757 543705 net.go:698] Add success.
I0320 14:46:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:46:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:46:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 14:46:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:46:14.456518 543705 disk_worker.go:494] system disk:vda1
I0320 14:46:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:46:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:46:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:46:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:46:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:46:16.472362 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:46:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:23.409766 543705 memory.go:184] no items to output this cycle
I0320 14:46:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 14:46:24.755421 543705 disk_info.go:125] begin check local disk info of client
I0320 14:46:24.757883 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:46:24.757890 543705 disk_info.go:196] parse disk info done, disk is : [0xc000586000 0xc000586040]
E0320 14:46:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:33.409801 543705 memory.go:184] no items to output this cycle
I0320 14:46:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:46:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:43.409827 543705 memory.go:191] Add success.
I0320 14:46:43.409836 543705 cpu.go:282] Add success.
I0320 14:46:43.420125 543705 net.go:648] Add success.
I0320 14:46:43.422952 543705 net.go:770] primary dev: ETH0
I0320 14:46:43.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:46:43.422978 543705 net.go:698] Add success.
I0320 14:46:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:46:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:46:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:46:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:46:53.409784 543705 cpu.go:275] no items to output this cycle
I0320 14:46:53.409792 543705 memory.go:184] no items to output this cycle
E0320 14:47:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:03.409779 543705 memory.go:184] no items to output this cycle
I0320 14:47:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 14:47:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:13.409805 543705 memory.go:191] Add success.
I0320 14:47:13.409817 543705 cpu.go:282] Add success.
W0320 14:47:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:47:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:47:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:47:13.420062 543705 net.go:648] Add success.
I0320 14:47:13.422771 543705 net.go:770] primary dev: ETH0
I0320 14:47:13.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:47:13.422795 543705 net.go:698] Add success.
I0320 14:47:13.453356 543705 event_worker.go:152] Polling the log file for events...
W0320 14:47:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:47:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 14:47:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:47:14.456903 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:47:14.456913 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:47:14.456919 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:47:14.456985 543705 disk_worker.go:494] system disk:vda1
I0320 14:47:14.457028 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:47:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:47:15.456823 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:47:16.457893 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:47:16.457905 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:47:16.457946 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:47:16.457963 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:47:16.472322 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:47:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:23.409779 543705 memory.go:184] no items to output this cycle
I0320 14:47:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 14:47:24.757971 543705 disk_info.go:125] begin check local disk info of client
I0320 14:47:24.760425 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:47:24.760431 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390980 0xc0003909c0]
E0320 14:47:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:33.409769 543705 memory.go:184] no items to output this cycle
I0320 14:47:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 14:47:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:43.409799 543705 memory.go:191] Add success.
I0320 14:47:43.409799 543705 cpu.go:282] Add success.
I0320 14:47:43.420067 543705 net.go:648] Add success.
I0320 14:47:43.422860 543705 net.go:770] primary dev: ETH0
I0320 14:47:43.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:47:43.422885 543705 net.go:698] Add success.
I0320 14:47:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:47:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:47:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:47:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:47:53.409776 543705 memory.go:184] no items to output this cycle
I0320 14:47:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 14:48:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:03.409799 543705 memory.go:184] no items to output this cycle
I0320 14:48:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 14:48:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:13.409789 543705 memory.go:191] Add success.
I0320 14:48:13.409813 543705 cpu.go:282] Add success.
W0320 14:48:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:48:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:48:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:48:13.420033 543705 net.go:648] Add success.
I0320 14:48:13.422850 543705 net.go:770] primary dev: ETH0
I0320 14:48:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:48:13.422875 543705 net.go:698] Add success.
I0320 14:48:13.464644 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"effec298-9aaa-461c-a21d-c788419aa290","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:48:13.464679 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:48:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:48:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:48:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 14:48:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:48:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 14:48:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:48:15.455610 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:48:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:48:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:48:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:48:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:48:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:23.409792 543705 memory.go:184] no items to output this cycle
I0320 14:48:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 14:48:24.761432 543705 disk_info.go:125] begin check local disk info of client
I0320 14:48:24.763861 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:48:24.763866 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046b480 0xc00046b4c0]
E0320 14:48:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:33.409791 543705 memory.go:184] no items to output this cycle
I0320 14:48:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 14:48:38.517342 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:48:38.517350 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:48:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:43.410672 543705 memory.go:191] Add success.
I0320 14:48:43.409811 543705 cpu.go:282] Add success.
I0320 14:48:43.420437 543705 net.go:648] Add success.
I0320 14:48:43.423284 543705 net.go:770] primary dev: ETH0
I0320 14:48:43.423299 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:48:43.423315 543705 net.go:698] Add success.
I0320 14:48:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:48:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:48:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:48:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:48:53.409810 543705 memory.go:184] no items to output this cycle
I0320 14:48:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 14:49:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:03.409769 543705 memory.go:184] no items to output this cycle
I0320 14:49:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 14:49:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:13.409789 543705 memory.go:191] Add success.
I0320 14:49:13.409792 543705 cpu.go:282] Add success.
W0320 14:49:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:49:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:49:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:49:13.420055 543705 net.go:648] Add success.
I0320 14:49:13.422871 543705 net.go:770] primary dev: ETH0
I0320 14:49:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:49:13.422895 543705 net.go:698] Add success.
I0320 14:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:49:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:49:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 14:49:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:49:14.456994 543705 disk_worker.go:494] system disk:vda1
I0320 14:49:14.457023 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:49:15.456013 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:49:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:49:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:49:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:49:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:49:23.410245 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:23.410260 543705 memory.go:184] no items to output this cycle
I0320 14:49:23.410266 543705 cpu.go:275] no items to output this cycle
I0320 14:49:24.765463 543705 disk_info.go:125] begin check local disk info of client
I0320 14:49:24.767928 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:49:24.767934 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005192c0 0xc000519300]
E0320 14:49:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:33.409774 543705 memory.go:184] no items to output this cycle
I0320 14:49:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 14:49:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:43.409790 543705 memory.go:191] Add success.
I0320 14:49:43.409808 543705 cpu.go:282] Add success.
I0320 14:49:43.419879 543705 net.go:648] Add success.
I0320 14:49:43.422731 543705 net.go:770] primary dev: ETH0
I0320 14:49:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:49:43.422756 543705 net.go:698] Add success.
I0320 14:49:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:49:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:49:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:49:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:49:53.409809 543705 memory.go:184] no items to output this cycle
I0320 14:49:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 14:50:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:03.409777 543705 memory.go:184] no items to output this cycle
I0320 14:50:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 14:50:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:13.409819 543705 memory.go:191] Add success.
I0320 14:50:13.409829 543705 cpu.go:282] Add success.
W0320 14:50:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:50:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:50:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:50:13.420062 543705 net.go:648] Add success.
I0320 14:50:13.423001 543705 net.go:770] primary dev: ETH0
I0320 14:50:13.423015 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:50:13.423029 543705 net.go:698] Add success.
I0320 14:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:50:14.455247 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:50:14.455335 543705 disk_worker.go:708] disk space is not compliant
W0320 14:50:14.455340 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:50:14.456969 543705 disk_worker.go:494] system disk:vda1
I0320 14:50:14.457002 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:50:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:50:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:50:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:50:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:50:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:50:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:23.409784 543705 memory.go:184] no items to output this cycle
I0320 14:50:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 14:50:24.769492 543705 disk_info.go:125] begin check local disk info of client
I0320 14:50:24.771950 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:50:24.771956 543705 disk_info.go:196] parse disk info done, disk is : [0xc000262300 0xc000262340]
E0320 14:50:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:33.409771 543705 memory.go:184] no items to output this cycle
I0320 14:50:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 14:50:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:43.409792 543705 memory.go:191] Add success.
I0320 14:50:43.409821 543705 cpu.go:282] Add success.
I0320 14:50:43.419910 543705 net.go:648] Add success.
I0320 14:50:43.422459 543705 net.go:770] primary dev: ETH0
I0320 14:50:43.422475 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:50:43.422490 543705 net.go:698] Add success.
I0320 14:50:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:50:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:50:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:50:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:50:53.409812 543705 memory.go:184] no items to output this cycle
I0320 14:50:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 14:51:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:03.409810 543705 memory.go:184] no items to output this cycle
I0320 14:51:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 14:51:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:13.409792 543705 memory.go:191] Add success.
I0320 14:51:13.409811 543705 cpu.go:282] Add success.
W0320 14:51:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:51:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:51:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:51:13.420071 543705 net.go:648] Add success.
I0320 14:51:13.422734 543705 net.go:770] primary dev: ETH0
I0320 14:51:13.422749 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:51:13.422763 543705 net.go:698] Add success.
I0320 14:51:13.469437 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"07381519-2296-4bdc-b06f-11baa0ddc5b9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:51:13.469470 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:51:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:51:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:51:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 14:51:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:51:14.457034 543705 disk_worker.go:494] system disk:vda1
I0320 14:51:14.457063 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:51:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:51:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:51:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:51:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:51:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:51:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:23.409779 543705 memory.go:184] no items to output this cycle
I0320 14:51:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 14:51:24.773501 543705 disk_info.go:125] begin check local disk info of client
I0320 14:51:24.775950 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:51:24.775956 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036b680 0xc00036b6c0]
E0320 14:51:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:33.409776 543705 memory.go:184] no items to output this cycle
I0320 14:51:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 14:51:38.518357 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:51:38.518364 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:51:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:43.410737 543705 memory.go:191] Add success.
I0320 14:51:43.409816 543705 cpu.go:282] Add success.
I0320 14:51:43.420489 543705 net.go:648] Add success.
I0320 14:51:43.423266 543705 net.go:770] primary dev: ETH0
I0320 14:51:43.423280 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:51:43.423293 543705 net.go:698] Add success.
I0320 14:51:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:51:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:51:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:51:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:51:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 14:51:53.409786 543705 memory.go:184] no items to output this cycle
E0320 14:52:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:03.409768 543705 memory.go:184] no items to output this cycle
I0320 14:52:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 14:52:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:13.409806 543705 memory.go:191] Add success.
I0320 14:52:13.409807 543705 cpu.go:282] Add success.
W0320 14:52:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:52:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:52:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:52:13.420058 543705 net.go:648] Add success.
I0320 14:52:13.422832 543705 net.go:770] primary dev: ETH0
I0320 14:52:13.422845 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:52:13.422858 543705 net.go:698] Add success.
W0320 14:52:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:52:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 14:52:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:52:14.455921 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:52:14.455930 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:52:14.455936 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:52:14.456843 543705 disk_worker.go:494] system disk:vda1
I0320 14:52:14.456880 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:52:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:52:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:52:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:52:16.457922 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:52:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:52:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:52:16.472323 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:52:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:23.409774 543705 memory.go:184] no items to output this cycle
I0320 14:52:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 14:52:24.776040 543705 disk_info.go:125] begin check local disk info of client
I0320 14:52:24.778480 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:52:24.778485 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470040 0xc000470080]
E0320 14:52:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:33.409778 543705 memory.go:184] no items to output this cycle
I0320 14:52:33.409778 543705 cpu.go:275] no items to output this cycle
E0320 14:52:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:43.409797 543705 memory.go:191] Add success.
I0320 14:52:43.409801 543705 cpu.go:282] Add success.
I0320 14:52:43.419888 543705 net.go:648] Add success.
I0320 14:52:43.422841 543705 net.go:770] primary dev: ETH0
I0320 14:52:43.422855 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:52:43.422871 543705 net.go:698] Add success.
I0320 14:52:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:52:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:52:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:52:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:52:53.409808 543705 memory.go:184] no items to output this cycle
I0320 14:52:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 14:53:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:03.409777 543705 memory.go:184] no items to output this cycle
I0320 14:53:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 14:53:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:13.409786 543705 memory.go:191] Add success.
I0320 14:53:13.409786 543705 cpu.go:282] Add success.
W0320 14:53:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:53:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:53:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:53:13.420037 543705 net.go:648] Add success.
I0320 14:53:13.422841 543705 net.go:770] primary dev: ETH0
I0320 14:53:13.422854 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:53:13.422865 543705 net.go:698] Add success.
I0320 14:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:53:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:53:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 14:53:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:53:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 14:53:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:53:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:53:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:53:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:53:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:53:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:53:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:23.409773 543705 memory.go:184] no items to output this cycle
I0320 14:53:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 14:53:24.779531 543705 disk_info.go:125] begin check local disk info of client
I0320 14:53:24.781978 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:53:24.781984 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004700c0 0xc000470100]
E0320 14:53:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:33.409773 543705 memory.go:184] no items to output this cycle
I0320 14:53:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 14:53:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:43.409789 543705 memory.go:191] Add success.
I0320 14:53:43.409806 543705 cpu.go:282] Add success.
I0320 14:53:43.419982 543705 net.go:648] Add success.
I0320 14:53:43.422661 543705 net.go:770] primary dev: ETH0
I0320 14:53:43.422673 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:53:43.422702 543705 net.go:698] Add success.
I0320 14:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:53:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:53:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:53:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:53:53.409785 543705 memory.go:184] no items to output this cycle
I0320 14:53:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 14:54:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:03.409768 543705 memory.go:184] no items to output this cycle
I0320 14:54:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 14:54:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:13.409813 543705 memory.go:191] Add success.
I0320 14:54:13.409821 543705 cpu.go:282] Add success.
W0320 14:54:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:54:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:54:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:54:13.420067 543705 net.go:648] Add success.
I0320 14:54:13.422843 543705 net.go:770] primary dev: ETH0
I0320 14:54:13.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:54:13.422874 543705 net.go:698] Add success.
I0320 14:54:13.578436 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c142ea5c-be98-4135-b1f4-cac3a1bfde9f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:54:13.578481 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 14:54:14.453975 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:54:14.454129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:54:14.454192 543705 disk_worker.go:708] disk space is not compliant
W0320 14:54:14.454195 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:54:14.455530 543705 disk_worker.go:494] system disk:vda1
I0320 14:54:14.455575 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:54:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:54:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:54:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:54:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:54:16.472445 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:54:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:23.409766 543705 memory.go:184] no items to output this cycle
I0320 14:54:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 14:54:24.783546 543705 disk_info.go:125] begin check local disk info of client
I0320 14:54:24.785995 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:54:24.786000 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004633c0 0xc000463400]
E0320 14:54:33.410374 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:33.410389 543705 memory.go:184] no items to output this cycle
I0320 14:54:33.410424 543705 cpu.go:275] no items to output this cycle
I0320 14:54:38.519350 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:54:38.519358 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:54:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:43.410697 543705 memory.go:191] Add success.
I0320 14:54:43.409825 543705 cpu.go:282] Add success.
I0320 14:54:43.420479 543705 net.go:648] Add success.
I0320 14:54:43.423085 543705 net.go:770] primary dev: ETH0
I0320 14:54:43.423098 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:54:43.423111 543705 net.go:698] Add success.
I0320 14:54:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:54:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:54:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:54:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:54:53.409778 543705 memory.go:184] no items to output this cycle
I0320 14:54:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 14:55:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:03.409793 543705 memory.go:184] no items to output this cycle
I0320 14:55:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 14:55:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:13.409781 543705 memory.go:191] Add success.
W0320 14:55:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:55:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:55:13.409818 543705 cpu.go:282] Add success.
I0320 14:55:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:55:13.420052 543705 net.go:648] Add success.
I0320 14:55:13.422862 543705 net.go:770] primary dev: ETH0
I0320 14:55:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:55:13.422891 543705 net.go:698] Add success.
I0320 14:55:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:55:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:55:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 14:55:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:55:14.456572 543705 disk_worker.go:494] system disk:vda1
I0320 14:55:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:55:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:55:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:55:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:55:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:55:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:55:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:23.409771 543705 memory.go:184] no items to output this cycle
I0320 14:55:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 14:55:24.787591 543705 disk_info.go:125] begin check local disk info of client
I0320 14:55:24.790037 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:55:24.790043 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae000 0xc0004ae040]
E0320 14:55:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:33.409773 543705 memory.go:184] no items to output this cycle
I0320 14:55:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 14:55:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:43.409793 543705 memory.go:191] Add success.
I0320 14:55:43.409796 543705 cpu.go:282] Add success.
I0320 14:55:43.420302 543705 net.go:648] Add success.
I0320 14:55:43.423091 543705 net.go:770] primary dev: ETH0
I0320 14:55:43.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:55:43.423117 543705 net.go:698] Add success.
I0320 14:55:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:55:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:55:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:55:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:55:53.409781 543705 memory.go:184] no items to output this cycle
I0320 14:55:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 14:56:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:03.409798 543705 memory.go:184] no items to output this cycle
I0320 14:56:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 14:56:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:13.409792 543705 memory.go:191] Add success.
I0320 14:56:13.409793 543705 cpu.go:282] Add success.
W0320 14:56:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:56:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:56:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:56:13.420061 543705 net.go:648] Add success.
I0320 14:56:13.423008 543705 net.go:770] primary dev: ETH0
I0320 14:56:13.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:56:13.423033 543705 net.go:698] Add success.
I0320 14:56:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:56:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:56:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 14:56:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:56:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 14:56:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:56:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:56:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:56:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:56:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:56:16.472091 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:56:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:23.409782 543705 memory.go:184] no items to output this cycle
I0320 14:56:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 14:56:24.790126 543705 disk_info.go:125] begin check local disk info of client
I0320 14:56:24.792549 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:56:24.792554 543705 disk_info.go:196] parse disk info done, disk is : [0xc000483140 0xc000483180]
E0320 14:56:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:33.409781 543705 memory.go:184] no items to output this cycle
I0320 14:56:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 14:56:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:43.409794 543705 memory.go:191] Add success.
I0320 14:56:43.409814 543705 cpu.go:282] Add success.
I0320 14:56:43.419885 543705 net.go:648] Add success.
I0320 14:56:43.422933 543705 net.go:770] primary dev: ETH0
I0320 14:56:43.422964 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:56:43.422979 543705 net.go:698] Add success.
I0320 14:56:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:56:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:56:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:56:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:56:53.409771 543705 memory.go:184] no items to output this cycle
I0320 14:56:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 14:57:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:03.409774 543705 memory.go:184] no items to output this cycle
I0320 14:57:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 14:57:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:13.409793 543705 memory.go:191] Add success.
I0320 14:57:13.409798 543705 cpu.go:282] Add success.
W0320 14:57:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:57:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:57:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:57:13.419894 543705 net.go:770] primary dev: ETH0
I0320 14:57:13.419909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:57:13.419924 543705 net.go:698] Add success.
I0320 14:57:13.426707 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 14:57:13.426949 543705 net.go:648] Add success.
I0320 14:57:13.453352 543705 event_worker.go:152] Polling the log file for events...
I0320 14:57:13.469083 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f784f0cc-5e44-4530-b798-0db705f7ba73","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 14:57:13.469115 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 14:57:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:57:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 14:57:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 14:57:14.456747 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 14:57:14.456756 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 14:57:14.456761 543705 custom_config.go:64] query custom config with name: gpu
I0320 14:57:14.456818 543705 disk_worker.go:494] system disk:vda1
I0320 14:57:14.456849 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 14:57:15.456880 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 14:57:15.456889 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:57:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 14:57:16.457973 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 14:57:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:57:16.458032 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:57:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:57:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:23.409806 543705 memory.go:184] no items to output this cycle
I0320 14:57:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 14:57:24.793601 543705 disk_info.go:125] begin check local disk info of client
I0320 14:57:24.796035 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:57:24.796041 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae080 0xc0004ae0c0]
E0320 14:57:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:33.409782 543705 memory.go:184] no items to output this cycle
I0320 14:57:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 14:57:38.520353 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 14:57:38.520361 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 14:57:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:43.410642 543705 memory.go:191] Add success.
I0320 14:57:43.409820 543705 cpu.go:282] Add success.
I0320 14:57:43.420346 543705 net.go:648] Add success.
I0320 14:57:43.422690 543705 net.go:770] primary dev: ETH0
I0320 14:57:43.422703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:57:43.422727 543705 net.go:698] Add success.
I0320 14:57:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:57:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:57:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:57:53.410280 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:57:53.410301 543705 memory.go:184] no items to output this cycle
I0320 14:57:53.410316 543705 cpu.go:275] no items to output this cycle
E0320 14:58:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:03.409786 543705 memory.go:184] no items to output this cycle
I0320 14:58:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 14:58:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:13.409794 543705 memory.go:191] Add success.
I0320 14:58:13.409797 543705 cpu.go:282] Add success.
W0320 14:58:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:58:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:58:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:58:13.420126 543705 net.go:648] Add success.
I0320 14:58:13.423091 543705 net.go:770] primary dev: ETH0
I0320 14:58:13.423104 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:58:13.423116 543705 net.go:698] Add success.
I0320 14:58:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:58:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:58:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 14:58:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:58:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 14:58:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:58:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:58:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:58:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:58:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:58:16.472109 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:58:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:23.409819 543705 memory.go:184] no items to output this cycle
I0320 14:58:23.409831 543705 cpu.go:275] no items to output this cycle
I0320 14:58:24.797617 543705 disk_info.go:125] begin check local disk info of client
I0320 14:58:24.800055 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:58:24.800060 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484440 0xc000484480]
E0320 14:58:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:33.409780 543705 memory.go:184] no items to output this cycle
I0320 14:58:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 14:58:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:43.409800 543705 memory.go:191] Add success.
I0320 14:58:43.409822 543705 cpu.go:282] Add success.
I0320 14:58:43.419894 543705 net.go:648] Add success.
I0320 14:58:43.422736 543705 net.go:770] primary dev: ETH0
I0320 14:58:43.422749 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:58:43.422761 543705 net.go:698] Add success.
I0320 14:58:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:58:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:58:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:58:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:58:53.409772 543705 memory.go:184] no items to output this cycle
I0320 14:58:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 14:59:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:03.409777 543705 memory.go:184] no items to output this cycle
I0320 14:59:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 14:59:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:13.409785 543705 memory.go:191] Add success.
I0320 14:59:13.409785 543705 cpu.go:282] Add success.
W0320 14:59:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 14:59:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 14:59:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 14:59:13.420120 543705 net.go:648] Add success.
I0320 14:59:13.422954 543705 net.go:770] primary dev: ETH0
I0320 14:59:13.422968 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:59:13.422983 543705 net.go:698] Add success.
I0320 14:59:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 14:59:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 14:59:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 14:59:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 14:59:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 14:59:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 14:59:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 14:59:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:59:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:59:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 14:59:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 14:59:23.410260 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:23.410276 543705 memory.go:184] no items to output this cycle
I0320 14:59:23.410276 543705 cpu.go:275] no items to output this cycle
I0320 14:59:24.800139 543705 disk_info.go:125] begin check local disk info of client
I0320 14:59:24.802657 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 14:59:24.802662 543705 disk_info.go:196] parse disk info done, disk is : [0xc000376b80 0xc000376bc0]
E0320 14:59:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:33.409778 543705 memory.go:184] no items to output this cycle
I0320 14:59:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 14:59:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:43.409783 543705 memory.go:191] Add success.
I0320 14:59:43.409813 543705 cpu.go:282] Add success.
I0320 14:59:43.420009 543705 net.go:648] Add success.
I0320 14:59:43.422871 543705 net.go:770] primary dev: ETH0
I0320 14:59:43.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0320 14:59:43.422899 543705 net.go:698] Add success.
I0320 14:59:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 14:59:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 14:59:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 14:59:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 14:59:53.409775 543705 memory.go:184] no items to output this cycle
I0320 14:59:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 15:00:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:03.409774 543705 memory.go:184] no items to output this cycle
I0320 15:00:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 15:00:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:13.409786 543705 memory.go:191] Add success.
I0320 15:00:13.409786 543705 cpu.go:282] Add success.
W0320 15:00:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:00:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:00:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:00:13.420053 543705 net.go:648] Add success.
I0320 15:00:13.422566 543705 net.go:770] primary dev: ETH0
I0320 15:00:13.422578 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:00:13.422589 543705 net.go:698] Add success.
I0320 15:00:13.468487 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ada63591-86cf-4b9c-ab34-ebe15726cc21","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:00:13.468526 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:00:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:00:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:00:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 15:00:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:00:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 15:00:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:00:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:00:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:00:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:00:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:00:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:00:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:23.409887 543705 memory.go:184] no items to output this cycle
I0320 15:00:23.409918 543705 cpu.go:275] no items to output this cycle
I0320 15:00:24.804652 543705 disk_info.go:125] begin check local disk info of client
I0320 15:00:24.807112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:00:24.807120 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae000 0xc0004ae040]
I0320 15:00:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 15:00:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:33.409808 543705 memory.go:184] no items to output this cycle
I0320 15:00:38.521369 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:00:38.521377 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:00:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:43.410578 543705 memory.go:191] Add success.
I0320 15:00:43.409818 543705 cpu.go:282] Add success.
I0320 15:00:43.420273 543705 net.go:648] Add success.
I0320 15:00:43.423025 543705 net.go:770] primary dev: ETH0
I0320 15:00:43.423038 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:00:43.423050 543705 net.go:698] Add success.
I0320 15:00:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:00:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:00:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:00:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:00:53.409785 543705 memory.go:184] no items to output this cycle
I0320 15:00:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 15:01:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:03.409780 543705 memory.go:184] no items to output this cycle
I0320 15:01:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 15:01:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:13.409783 543705 memory.go:191] Add success.
I0320 15:01:13.409786 543705 cpu.go:282] Add success.
W0320 15:01:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:01:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:01:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:01:13.420204 543705 net.go:648] Add success.
I0320 15:01:13.423136 543705 net.go:770] primary dev: ETH0
I0320 15:01:13.423154 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:01:13.423167 543705 net.go:698] Add success.
I0320 15:01:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:01:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:01:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 15:01:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:01:14.456623 543705 disk_worker.go:494] system disk:vda1
I0320 15:01:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:01:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:01:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:01:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:01:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:01:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:01:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:23.409791 543705 memory.go:184] no items to output this cycle
I0320 15:01:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 15:01:24.808665 543705 disk_info.go:125] begin check local disk info of client
I0320 15:01:24.811099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:01:24.811106 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331180 0xc0003311c0]
E0320 15:01:33.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:33.409864 543705 memory.go:184] no items to output this cycle
I0320 15:01:33.410026 543705 cpu.go:275] no items to output this cycle
E0320 15:01:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:43.409795 543705 memory.go:191] Add success.
I0320 15:01:43.409814 543705 cpu.go:282] Add success.
I0320 15:01:43.419978 543705 net.go:648] Add success.
I0320 15:01:43.422849 543705 net.go:770] primary dev: ETH0
I0320 15:01:43.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:01:43.422884 543705 net.go:698] Add success.
I0320 15:01:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:01:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:01:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:01:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:01:53.409788 543705 memory.go:184] no items to output this cycle
I0320 15:01:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 15:02:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:03.409803 543705 memory.go:184] no items to output this cycle
I0320 15:02:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:02:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:13.409777 543705 memory.go:191] Add success.
W0320 15:02:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:02:13.409816 543705 cpu.go:282] Add success.
W0320 15:02:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:02:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:02:13.420317 543705 net.go:648] Add success.
I0320 15:02:13.423149 543705 net.go:770] primary dev: ETH0
I0320 15:02:13.423166 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:02:13.423178 543705 net.go:698] Add success.
W0320 15:02:14.454283 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:02:14.454296 543705 disk_worker.go:708] disk space is not compliant
W0320 15:02:14.454300 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:02:14.454922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:02:14.454930 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:02:14.454936 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:02:14.455864 543705 disk_worker.go:494] system disk:vda1
I0320 15:02:14.455908 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:02:15.456793 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:02:15.456802 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:02:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:02:16.457982 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:02:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:02:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:02:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:02:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 15:02:23.409791 543705 memory.go:184] no items to output this cycle
I0320 15:02:24.812745 543705 disk_info.go:125] begin check local disk info of client
I0320 15:02:24.815291 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:02:24.815297 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032b080 0xc00032b0c0]
E0320 15:02:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:33.409780 543705 memory.go:184] no items to output this cycle
I0320 15:02:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 15:02:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:43.409823 543705 memory.go:191] Add success.
I0320 15:02:43.409835 543705 cpu.go:282] Add success.
I0320 15:02:43.420022 543705 net.go:648] Add success.
I0320 15:02:43.422900 543705 net.go:770] primary dev: ETH0
I0320 15:02:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:02:43.422925 543705 net.go:698] Add success.
I0320 15:02:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:02:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:02:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:02:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:02:53.409800 543705 memory.go:184] no items to output this cycle
I0320 15:02:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 15:03:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:03.409780 543705 memory.go:184] no items to output this cycle
I0320 15:03:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 15:03:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:13.409785 543705 memory.go:191] Add success.
I0320 15:03:13.409802 543705 cpu.go:282] Add success.
W0320 15:03:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:03:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:03:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:03:13.420260 543705 net.go:648] Add success.
I0320 15:03:13.422893 543705 net.go:770] primary dev: ETH0
I0320 15:03:13.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:03:13.422918 543705 net.go:698] Add success.
I0320 15:03:13.468924 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"634a3010-b6e9-4e8b-afcf-b6f9be5b7f97","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:03:13.468958 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:03:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:03:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:03:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 15:03:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:03:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 15:03:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:03:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:03:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:03:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:03:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:03:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:03:23.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:23.409899 543705 memory.go:184] no items to output this cycle
I0320 15:03:23.409903 543705 cpu.go:275] no items to output this cycle
I0320 15:03:24.815378 543705 disk_info.go:125] begin check local disk info of client
I0320 15:03:24.817874 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:03:24.817879 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4100 0xc0000c4140]
E0320 15:03:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:33.409780 543705 memory.go:184] no items to output this cycle
I0320 15:03:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 15:03:38.522367 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:03:38.522375 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:03:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:43.410770 543705 memory.go:191] Add success.
I0320 15:03:43.409823 543705 cpu.go:282] Add success.
I0320 15:03:43.420548 543705 net.go:648] Add success.
I0320 15:03:43.423508 543705 net.go:770] primary dev: ETH0
I0320 15:03:43.423521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:03:43.423533 543705 net.go:698] Add success.
I0320 15:03:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:03:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:03:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:03:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:03:53.409787 543705 memory.go:184] no items to output this cycle
I0320 15:03:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 15:04:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:03.409778 543705 memory.go:184] no items to output this cycle
I0320 15:04:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 15:04:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:13.409778 543705 memory.go:191] Add success.
W0320 15:04:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:04:13.409810 543705 cpu.go:282] Add success.
W0320 15:04:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:04:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:04:13.420080 543705 net.go:770] primary dev: ETH0
I0320 15:04:13.420096 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:04:13.420111 543705 net.go:698] Add success.
I0320 15:04:13.420468 543705 net.go:648] Add success.
I0320 15:04:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:04:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:04:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 15:04:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:04:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 15:04:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:04:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:04:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:04:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:04:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:04:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:04:23.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:23.409880 543705 memory.go:184] no items to output this cycle
I0320 15:04:23.409982 543705 cpu.go:275] no items to output this cycle
I0320 15:04:24.819740 543705 disk_info.go:125] begin check local disk info of client
I0320 15:04:24.822223 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:04:24.822229 543705 disk_info.go:196] parse disk info done, disk is : [0xc000472180 0xc0004721c0]
E0320 15:04:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:33.409773 543705 memory.go:184] no items to output this cycle
I0320 15:04:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 15:04:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:43.409827 543705 memory.go:191] Add success.
I0320 15:04:43.409838 543705 cpu.go:282] Add success.
I0320 15:04:43.420059 543705 net.go:648] Add success.
I0320 15:04:43.422431 543705 net.go:770] primary dev: ETH0
I0320 15:04:43.422444 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:04:43.422458 543705 net.go:698] Add success.
I0320 15:04:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:04:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:04:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:04:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:04:53.409780 543705 cpu.go:275] no items to output this cycle
I0320 15:04:53.409789 543705 memory.go:184] no items to output this cycle
E0320 15:05:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:03.409778 543705 memory.go:184] no items to output this cycle
I0320 15:05:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 15:05:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:13.409790 543705 memory.go:191] Add success.
I0320 15:05:13.409794 543705 cpu.go:282] Add success.
W0320 15:05:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:05:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:05:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:05:13.420046 543705 net.go:648] Add success.
I0320 15:05:13.423218 543705 net.go:770] primary dev: ETH0
I0320 15:05:13.423239 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:05:13.423251 543705 net.go:698] Add success.
I0320 15:05:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:05:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:05:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 15:05:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:05:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 15:05:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:05:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:05:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:05:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:05:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:05:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:05:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:23.409773 543705 memory.go:184] no items to output this cycle
I0320 15:05:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 15:05:24.823737 543705 disk_info.go:125] begin check local disk info of client
I0320 15:05:24.826176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:05:24.826182 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e8cc0 0xc0003e8d00]
E0320 15:05:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:33.409776 543705 memory.go:184] no items to output this cycle
I0320 15:05:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 15:05:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:43.409822 543705 memory.go:191] Add success.
I0320 15:05:43.409823 543705 cpu.go:282] Add success.
I0320 15:05:43.419955 543705 net.go:648] Add success.
I0320 15:05:43.422772 543705 net.go:770] primary dev: ETH0
I0320 15:05:43.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:05:43.422798 543705 net.go:698] Add success.
I0320 15:05:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:05:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:05:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:05:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:05:53.409777 543705 memory.go:184] no items to output this cycle
I0320 15:05:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 15:06:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:03.409793 543705 memory.go:184] no items to output this cycle
I0320 15:06:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 15:06:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:13.409784 543705 memory.go:191] Add success.
I0320 15:06:13.409806 543705 cpu.go:282] Add success.
W0320 15:06:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:06:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:06:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:06:13.420060 543705 net.go:648] Add success.
I0320 15:06:13.422782 543705 net.go:770] primary dev: ETH0
I0320 15:06:13.422796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:06:13.422810 543705 net.go:698] Add success.
I0320 15:06:13.504405 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f95c362a-fcff-4bcb-97a7-4e2aebabdc66","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:06:13.504440 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:06:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:06:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:06:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 15:06:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:06:14.456855 543705 disk_worker.go:494] system disk:vda1
I0320 15:06:14.456884 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:06:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:06:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:06:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:06:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:06:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:06:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:23.409769 543705 memory.go:184] no items to output this cycle
I0320 15:06:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 15:06:24.827763 543705 disk_info.go:125] begin check local disk info of client
I0320 15:06:24.830190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:06:24.830196 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025b180 0xc00025b1c0]
E0320 15:06:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:33.409807 543705 memory.go:184] no items to output this cycle
I0320 15:06:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 15:06:38.523383 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:06:38.523392 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:06:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:43.410543 543705 memory.go:191] Add success.
I0320 15:06:43.409838 543705 cpu.go:282] Add success.
I0320 15:06:43.420235 543705 net.go:648] Add success.
I0320 15:06:43.422926 543705 net.go:770] primary dev: ETH0
I0320 15:06:43.422941 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:06:43.422957 543705 net.go:698] Add success.
I0320 15:06:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:06:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:06:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:06:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:06:53.409775 543705 memory.go:184] no items to output this cycle
I0320 15:06:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 15:07:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:03.409780 543705 cpu.go:275] no items to output this cycle
I0320 15:07:03.409783 543705 memory.go:184] no items to output this cycle
W0320 15:07:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:07:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:07:13.409739 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 15:07:13.409830 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:13.409830 543705 cpu.go:282] Add success.
I0320 15:07:13.409847 543705 memory.go:191] Add success.
I0320 15:07:13.420390 543705 net.go:648] Add success.
I0320 15:07:13.423248 543705 net.go:770] primary dev: ETH0
I0320 15:07:13.423262 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:07:13.423273 543705 net.go:698] Add success.
I0320 15:07:13.452770 543705 event_worker.go:152] Polling the log file for events...
W0320 15:07:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:07:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 15:07:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:07:14.457067 543705 disk_worker.go:494] system disk:vda1
I0320 15:07:14.457092 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:07:14.457229 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:07:14.457234 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:07:14.457238 543705 custom_config.go:64] query custom config with name: gpu
E0320 15:07:15.456823 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:07:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:07:16.457893 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:07:16.457894 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:07:16.457947 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:07:16.457965 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:07:16.472275 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:07:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:23.409801 543705 memory.go:184] no items to output this cycle
I0320 15:07:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 15:07:24.830277 543705 disk_info.go:125] begin check local disk info of client
I0320 15:07:24.832758 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:07:24.832764 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae900 0xc0004ae940]
E0320 15:07:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:33.409782 543705 memory.go:184] no items to output this cycle
I0320 15:07:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 15:07:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:43.409788 543705 memory.go:191] Add success.
I0320 15:07:43.409821 543705 cpu.go:282] Add success.
I0320 15:07:43.419869 543705 net.go:648] Add success.
I0320 15:07:43.422623 543705 net.go:770] primary dev: ETH0
I0320 15:07:43.422636 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:07:43.422648 543705 net.go:698] Add success.
I0320 15:07:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:07:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:07:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:07:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:07:53.409809 543705 memory.go:184] no items to output this cycle
I0320 15:07:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:08:03.409831 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:03.409850 543705 memory.go:184] no items to output this cycle
I0320 15:08:03.409909 543705 cpu.go:275] no items to output this cycle
E0320 15:08:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:13.409778 543705 memory.go:191] Add success.
W0320 15:08:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:08:13.409811 543705 cpu.go:282] Add success.
W0320 15:08:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:08:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:08:13.420209 543705 net.go:648] Add success.
I0320 15:08:13.423218 543705 net.go:770] primary dev: ETH0
I0320 15:08:13.423237 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:08:13.423251 543705 net.go:698] Add success.
I0320 15:08:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:08:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:08:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 15:08:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:08:14.456820 543705 disk_worker.go:494] system disk:vda1
I0320 15:08:14.456848 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:08:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:08:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:08:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:08:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:08:16.472432 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:08:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:23.409768 543705 memory.go:184] no items to output this cycle
I0320 15:08:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 15:08:24.833672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:08:24.836087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:08:24.836093 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509bc0 0xc000509c00]
E0320 15:08:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:33.409806 543705 memory.go:184] no items to output this cycle
I0320 15:08:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 15:08:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:43.409829 543705 memory.go:191] Add success.
I0320 15:08:43.409830 543705 cpu.go:282] Add success.
I0320 15:08:43.419969 543705 net.go:648] Add success.
I0320 15:08:43.423096 543705 net.go:770] primary dev: ETH0
I0320 15:08:43.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:08:43.423121 543705 net.go:698] Add success.
I0320 15:08:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:08:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:08:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:08:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:08:53.409804 543705 memory.go:184] no items to output this cycle
I0320 15:08:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 15:09:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:03.409795 543705 memory.go:184] no items to output this cycle
I0320 15:09:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 15:09:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:13.409796 543705 memory.go:191] Add success.
W0320 15:09:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:09:13.409826 543705 cpu.go:282] Add success.
W0320 15:09:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:09:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:09:13.420301 543705 net.go:648] Add success.
I0320 15:09:13.423146 543705 net.go:770] primary dev: ETH0
I0320 15:09:13.423160 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:09:13.423171 543705 net.go:698] Add success.
I0320 15:09:13.588163 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1f01033e-3df3-43e8-bb70-3f3dc967a5dd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:09:13.588203 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:09:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:09:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:09:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 15:09:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:09:14.456720 543705 disk_worker.go:494] system disk:vda1
I0320 15:09:14.456751 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:09:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:09:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:09:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:09:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:09:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:09:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:23.409765 543705 memory.go:184] no items to output this cycle
I0320 15:09:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 15:09:24.837673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:09:24.840158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:09:24.840165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c0b40 0xc0003c0b80]
E0320 15:09:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:33.409785 543705 memory.go:184] no items to output this cycle
I0320 15:09:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 15:09:38.524388 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:09:38.524395 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:09:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:43.410674 543705 memory.go:191] Add success.
I0320 15:09:43.409819 543705 cpu.go:282] Add success.
I0320 15:09:43.420384 543705 net.go:648] Add success.
I0320 15:09:43.422958 543705 net.go:770] primary dev: ETH0
I0320 15:09:43.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:09:43.422984 543705 net.go:698] Add success.
I0320 15:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:09:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:09:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:09:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:09:53.409785 543705 memory.go:184] no items to output this cycle
I0320 15:09:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 15:10:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:03.409800 543705 memory.go:184] no items to output this cycle
I0320 15:10:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 15:10:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:13.409790 543705 memory.go:191] Add success.
I0320 15:10:13.409791 543705 cpu.go:282] Add success.
W0320 15:10:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:10:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:10:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:10:13.420255 543705 net.go:648] Add success.
I0320 15:10:13.422978 543705 net.go:770] primary dev: ETH0
I0320 15:10:13.422992 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:10:13.423004 543705 net.go:698] Add success.
I0320 15:10:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:10:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:10:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 15:10:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:10:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 15:10:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:10:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:10:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:10:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:10:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:10:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:10:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:23.409763 543705 memory.go:184] no items to output this cycle
I0320 15:10:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 15:10:24.841678 543705 disk_info.go:125] begin check local disk info of client
I0320 15:10:24.844083 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:10:24.844090 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370d80 0xc000370dc0]
E0320 15:10:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:33.409812 543705 memory.go:184] no items to output this cycle
I0320 15:10:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 15:10:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:43.409803 543705 memory.go:191] Add success.
I0320 15:10:43.409804 543705 cpu.go:282] Add success.
I0320 15:10:43.420000 543705 net.go:648] Add success.
I0320 15:10:43.422759 543705 net.go:770] primary dev: ETH0
I0320 15:10:43.422774 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:10:43.422788 543705 net.go:698] Add success.
I0320 15:10:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:10:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:10:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:10:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:10:53.409781 543705 memory.go:184] no items to output this cycle
I0320 15:10:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 15:11:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:03.409776 543705 memory.go:184] no items to output this cycle
I0320 15:11:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 15:11:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:13.409782 543705 memory.go:191] Add success.
I0320 15:11:13.409803 543705 cpu.go:282] Add success.
W0320 15:11:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:11:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:11:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:11:13.420364 543705 net.go:648] Add success.
I0320 15:11:13.423268 543705 net.go:770] primary dev: ETH0
I0320 15:11:13.423281 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:11:13.423293 543705 net.go:698] Add success.
I0320 15:11:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:11:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:11:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 15:11:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:11:14.456552 543705 disk_worker.go:494] system disk:vda1
I0320 15:11:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:11:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:11:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:11:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:11:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:11:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:11:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:23.409788 543705 memory.go:184] no items to output this cycle
I0320 15:11:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:11:24.845674 543705 disk_info.go:125] begin check local disk info of client
I0320 15:11:24.848162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:11:24.848168 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ad80 0xc00007adc0]
E0320 15:11:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:33.409796 543705 memory.go:184] no items to output this cycle
I0320 15:11:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 15:11:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:43.409792 543705 memory.go:191] Add success.
I0320 15:11:43.409812 543705 cpu.go:282] Add success.
I0320 15:11:43.420062 543705 net.go:648] Add success.
I0320 15:11:43.423390 543705 net.go:770] primary dev: ETH0
I0320 15:11:43.423402 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:11:43.423414 543705 net.go:698] Add success.
I0320 15:11:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:11:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:11:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:11:53.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:11:53.409819 543705 memory.go:184] no items to output this cycle
I0320 15:11:53.409829 543705 cpu.go:275] no items to output this cycle
E0320 15:12:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:03.409779 543705 memory.go:184] no items to output this cycle
I0320 15:12:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 15:12:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:13.409825 543705 memory.go:191] Add success.
I0320 15:12:13.409828 543705 cpu.go:282] Add success.
W0320 15:12:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:12:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:12:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:12:13.420236 543705 net.go:648] Add success.
I0320 15:12:13.422903 543705 net.go:770] primary dev: ETH0
I0320 15:12:13.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:12:13.422927 543705 net.go:698] Add success.
I0320 15:12:13.474027 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"75841e96-7fb5-45ea-9983-55107fc01553","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:12:13.474061 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 15:12:14.455373 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:12:14.455394 543705 disk_worker.go:708] disk space is not compliant
W0320 15:12:14.455399 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:12:14.456639 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:12:14.456649 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:12:14.456670 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:12:14.456929 543705 disk_worker.go:494] system disk:vda1
I0320 15:12:14.456965 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:12:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:12:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:12:16.457914 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:12:16.457914 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:12:16.457970 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:12:16.457990 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:12:16.472307 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:12:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:23.409821 543705 memory.go:184] no items to output this cycle
I0320 15:12:23.409824 543705 cpu.go:275] no items to output this cycle
I0320 15:12:24.849680 543705 disk_info.go:125] begin check local disk info of client
I0320 15:12:24.852217 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:12:24.852224 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5180 0xc0000c51c0]
E0320 15:12:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:33.409777 543705 memory.go:184] no items to output this cycle
I0320 15:12:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 15:12:38.525375 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:12:38.525383 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:12:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:43.410648 543705 memory.go:191] Add success.
I0320 15:12:43.409832 543705 cpu.go:282] Add success.
I0320 15:12:43.420358 543705 net.go:648] Add success.
I0320 15:12:43.422955 543705 net.go:770] primary dev: ETH0
I0320 15:12:43.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:12:43.422984 543705 net.go:698] Add success.
I0320 15:12:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:12:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:12:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:12:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:12:53.409813 543705 memory.go:184] no items to output this cycle
I0320 15:12:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 15:13:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:03.409780 543705 memory.go:184] no items to output this cycle
I0320 15:13:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 15:13:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:13.409802 543705 memory.go:191] Add success.
I0320 15:13:13.409802 543705 cpu.go:282] Add success.
W0320 15:13:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:13:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:13:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:13:13.420439 543705 net.go:648] Add success.
I0320 15:13:13.423081 543705 net.go:770] primary dev: ETH0
I0320 15:13:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:13:13.423107 543705 net.go:698] Add success.
I0320 15:13:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:13:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:13:14.455138 543705 disk_worker.go:708] disk space is not compliant
W0320 15:13:14.455141 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:13:14.456457 543705 disk_worker.go:494] system disk:vda1
I0320 15:13:14.456498 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:13:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:13:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:13:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:13:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:13:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:13:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:23.409811 543705 memory.go:184] no items to output this cycle
I0320 15:13:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 15:13:24.853676 543705 disk_info.go:125] begin check local disk info of client
I0320 15:13:24.856208 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:13:24.856214 543705 disk_info.go:196] parse disk info done, disk is : [0xc000256fc0 0xc000257000]
E0320 15:13:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:33.409808 543705 memory.go:184] no items to output this cycle
I0320 15:13:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:13:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:43.409804 543705 memory.go:191] Add success.
I0320 15:13:43.409821 543705 cpu.go:282] Add success.
I0320 15:13:43.419993 543705 net.go:648] Add success.
I0320 15:13:43.422835 543705 net.go:770] primary dev: ETH0
I0320 15:13:43.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:13:43.422864 543705 net.go:698] Add success.
I0320 15:13:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:13:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:13:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:13:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:13:53.409811 543705 memory.go:184] no items to output this cycle
I0320 15:13:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 15:14:03.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:03.409902 543705 memory.go:184] no items to output this cycle
I0320 15:14:03.409958 543705 cpu.go:275] no items to output this cycle
E0320 15:14:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:13.409804 543705 memory.go:191] Add success.
I0320 15:14:13.409806 543705 cpu.go:282] Add success.
W0320 15:14:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:14:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:14:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:14:13.420158 543705 net.go:648] Add success.
I0320 15:14:13.423026 543705 net.go:770] primary dev: ETH0
I0320 15:14:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:14:13.423051 543705 net.go:698] Add success.
I0320 15:14:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:14:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:14:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 15:14:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:14:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 15:14:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:14:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:14:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:14:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:14:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:14:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:14:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:23.409788 543705 memory.go:184] no items to output this cycle
I0320 15:14:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 15:14:24.857673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:14:24.860181 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:14:24.860187 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587c40 0xc000587c80]
E0320 15:14:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:33.409808 543705 memory.go:184] no items to output this cycle
I0320 15:14:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 15:14:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:43.409816 543705 memory.go:191] Add success.
I0320 15:14:43.409823 543705 cpu.go:282] Add success.
I0320 15:14:43.419977 543705 net.go:648] Add success.
I0320 15:14:43.423004 543705 net.go:770] primary dev: ETH0
I0320 15:14:43.423019 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:14:43.423035 543705 net.go:698] Add success.
I0320 15:14:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:14:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:14:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:14:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:14:53.409777 543705 memory.go:184] no items to output this cycle
I0320 15:14:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 15:15:03.409890 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:03.409910 543705 memory.go:184] no items to output this cycle
I0320 15:15:03.409963 543705 cpu.go:275] no items to output this cycle
E0320 15:15:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:13.409784 543705 memory.go:191] Add success.
W0320 15:15:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:15:13.409818 543705 cpu.go:282] Add success.
W0320 15:15:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:15:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:15:13.420255 543705 net.go:648] Add success.
I0320 15:15:13.423266 543705 net.go:770] primary dev: ETH0
I0320 15:15:13.423281 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:15:13.423295 543705 net.go:698] Add success.
I0320 15:15:13.527951 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d70f2f5f-ca49-481a-8808-7e8e0eba0f31","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:15:13.527986 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:15:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:15:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:15:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 15:15:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:15:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 15:15:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:15:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:15:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:15:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:15:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:15:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:15:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:23.409804 543705 memory.go:184] no items to output this cycle
I0320 15:15:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 15:15:24.861674 543705 disk_info.go:125] begin check local disk info of client
I0320 15:15:24.864217 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:15:24.864224 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa3c0 0xc0001aa400]
E0320 15:15:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:33.409772 543705 memory.go:184] no items to output this cycle
I0320 15:15:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 15:15:38.526380 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:15:38.526388 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:15:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:43.410603 543705 memory.go:191] Add success.
I0320 15:15:43.409816 543705 cpu.go:282] Add success.
I0320 15:15:43.420358 543705 net.go:648] Add success.
I0320 15:15:43.422906 543705 net.go:770] primary dev: ETH0
I0320 15:15:43.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:15:43.422933 543705 net.go:698] Add success.
I0320 15:15:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:15:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:15:46.458054 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:15:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:15:53.409774 543705 memory.go:184] no items to output this cycle
I0320 15:15:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 15:16:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:03.409779 543705 memory.go:184] no items to output this cycle
I0320 15:16:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:16:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:13.409787 543705 memory.go:191] Add success.
W0320 15:16:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:16:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:16:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:16:13.409841 543705 cpu.go:282] Add success.
I0320 15:16:13.420064 543705 net.go:648] Add success.
I0320 15:16:13.422836 543705 net.go:770] primary dev: ETH0
I0320 15:16:13.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:16:13.422864 543705 net.go:698] Add success.
I0320 15:16:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:16:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:16:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 15:16:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:16:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 15:16:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:16:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:16:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:16:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:16:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:16:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:16:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:23.409778 543705 memory.go:184] no items to output this cycle
I0320 15:16:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 15:16:24.865673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:16:24.868149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:16:24.868154 543705 disk_info.go:196] parse disk info done, disk is : [0xc000586180 0xc0005861c0]
E0320 15:16:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:33.409772 543705 memory.go:184] no items to output this cycle
I0320 15:16:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 15:16:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:43.409821 543705 memory.go:191] Add success.
I0320 15:16:43.409832 543705 cpu.go:282] Add success.
I0320 15:16:43.420257 543705 net.go:648] Add success.
I0320 15:16:43.423279 543705 net.go:770] primary dev: ETH0
I0320 15:16:43.423293 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:16:43.423305 543705 net.go:698] Add success.
I0320 15:16:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:16:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:16:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:16:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:16:53.409786 543705 memory.go:184] no items to output this cycle
I0320 15:16:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 15:17:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:03.409785 543705 memory.go:184] no items to output this cycle
I0320 15:17:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 15:17:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:13.409793 543705 memory.go:191] Add success.
I0320 15:17:13.409800 543705 cpu.go:282] Add success.
W0320 15:17:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:17:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:17:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:17:13.420155 543705 net.go:648] Add success.
I0320 15:17:13.423302 543705 net.go:770] primary dev: ETH0
I0320 15:17:13.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:17:13.423330 543705 net.go:698] Add success.
I0320 15:17:13.452804 543705 event_worker.go:152] Polling the log file for events...
W0320 15:17:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:17:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 15:17:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:17:14.455885 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:17:14.455894 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:17:14.455900 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:17:14.456530 543705 disk_worker.go:494] system disk:vda1
I0320 15:17:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:17:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:17:15.456800 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:17:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:17:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:17:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:17:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:17:16.472338 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:17:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:23.409770 543705 memory.go:184] no items to output this cycle
I0320 15:17:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 15:17:24.869673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:17:24.872152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:17:24.872157 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb700 0xc0001fb740]
E0320 15:17:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:33.409797 543705 memory.go:184] no items to output this cycle
I0320 15:17:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 15:17:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:43.409808 543705 memory.go:191] Add success.
I0320 15:17:43.409827 543705 cpu.go:282] Add success.
I0320 15:17:43.419923 543705 net.go:648] Add success.
I0320 15:17:43.422524 543705 net.go:770] primary dev: ETH0
I0320 15:17:43.422539 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:17:43.422553 543705 net.go:698] Add success.
I0320 15:17:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:17:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:17:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:17:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:17:53.409773 543705 memory.go:184] no items to output this cycle
I0320 15:17:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 15:18:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:03.409791 543705 memory.go:184] no items to output this cycle
I0320 15:18:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 15:18:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:13.409830 543705 memory.go:191] Add success.
I0320 15:18:13.409842 543705 cpu.go:282] Add success.
W0320 15:18:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:18:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:18:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:18:13.420127 543705 net.go:648] Add success.
I0320 15:18:13.423039 543705 net.go:770] primary dev: ETH0
I0320 15:18:13.423052 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:18:13.423063 543705 net.go:698] Add success.
I0320 15:18:13.663302 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a88c284c-da0b-474a-a862-1ce56b8ecbe2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:18:13.663343 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:18:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:18:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:18:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 15:18:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:18:14.456661 543705 disk_worker.go:494] system disk:vda1
I0320 15:18:14.456691 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:18:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:18:16.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:18:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:18:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:18:16.473061 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:18:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:23.409796 543705 memory.go:184] no items to output this cycle
I0320 15:18:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 15:18:24.873672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:18:24.876155 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:18:24.876161 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa5c0 0xc0001fa600]
E0320 15:18:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:33.409776 543705 memory.go:184] no items to output this cycle
I0320 15:18:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 15:18:38.527401 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:18:38.527409 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:18:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:43.410598 543705 memory.go:191] Add success.
I0320 15:18:43.409818 543705 cpu.go:282] Add success.
I0320 15:18:43.420303 543705 net.go:648] Add success.
I0320 15:18:43.423022 543705 net.go:770] primary dev: ETH0
I0320 15:18:43.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:18:43.423054 543705 net.go:698] Add success.
I0320 15:18:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:18:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:18:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:18:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:18:53.409800 543705 memory.go:184] no items to output this cycle
I0320 15:18:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 15:19:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:03.409778 543705 memory.go:184] no items to output this cycle
I0320 15:19:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:19:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:13.409792 543705 memory.go:191] Add success.
I0320 15:19:13.409803 543705 cpu.go:282] Add success.
W0320 15:19:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:19:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:19:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:19:13.420133 543705 net.go:648] Add success.
I0320 15:19:13.422785 543705 net.go:770] primary dev: ETH0
I0320 15:19:13.422798 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:19:13.422809 543705 net.go:698] Add success.
I0320 15:19:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:19:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:19:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 15:19:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:19:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 15:19:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:19:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:19:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:19:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:19:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:19:16.472432 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:19:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:23.409769 543705 memory.go:184] no items to output this cycle
I0320 15:19:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 15:19:24.877672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:19:24.880187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:19:24.880193 543705 disk_info.go:196] parse disk info done, disk is : [0xc000586cc0 0xc000586d00]
E0320 15:19:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:33.409768 543705 memory.go:184] no items to output this cycle
I0320 15:19:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 15:19:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:43.409829 543705 memory.go:191] Add success.
I0320 15:19:43.409832 543705 cpu.go:282] Add success.
I0320 15:19:43.420006 543705 net.go:648] Add success.
I0320 15:19:43.423392 543705 net.go:770] primary dev: ETH0
I0320 15:19:43.423411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:19:43.423426 543705 net.go:698] Add success.
I0320 15:19:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:19:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:19:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:19:53.410261 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:19:53.410281 543705 cpu.go:275] no items to output this cycle
I0320 15:19:53.410288 543705 memory.go:184] no items to output this cycle
E0320 15:20:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:03.409902 543705 memory.go:184] no items to output this cycle
I0320 15:20:03.409923 543705 cpu.go:275] no items to output this cycle
E0320 15:20:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:13.409822 543705 memory.go:191] Add success.
I0320 15:20:13.409832 543705 cpu.go:282] Add success.
W0320 15:20:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:20:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:20:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:20:13.420295 543705 net.go:648] Add success.
I0320 15:20:13.422983 543705 net.go:770] primary dev: ETH0
I0320 15:20:13.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:20:13.423007 543705 net.go:698] Add success.
I0320 15:20:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:20:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:20:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 15:20:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:20:14.456538 543705 disk_worker.go:494] system disk:vda1
I0320 15:20:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:20:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:20:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:20:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:20:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:20:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:20:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:23.409770 543705 memory.go:184] no items to output this cycle
I0320 15:20:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 15:20:24.881671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:20:24.884172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:20:24.884178 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa480 0xc0001fa4c0]
E0320 15:20:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:33.409779 543705 memory.go:184] no items to output this cycle
I0320 15:20:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 15:20:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:43.409800 543705 memory.go:191] Add success.
I0320 15:20:43.409805 543705 cpu.go:282] Add success.
I0320 15:20:43.419985 543705 net.go:648] Add success.
I0320 15:20:43.423511 543705 net.go:770] primary dev: ETH0
I0320 15:20:43.423527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:20:43.423542 543705 net.go:698] Add success.
I0320 15:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:20:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:20:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:20:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:20:53.409803 543705 memory.go:184] no items to output this cycle
I0320 15:20:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:21:03.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:03.409890 543705 memory.go:184] no items to output this cycle
I0320 15:21:03.409924 543705 cpu.go:275] no items to output this cycle
E0320 15:21:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:13.409818 543705 memory.go:191] Add success.
I0320 15:21:13.409827 543705 cpu.go:282] Add success.
W0320 15:21:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:21:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:21:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:21:13.420239 543705 net.go:648] Add success.
I0320 15:21:13.423041 543705 net.go:770] primary dev: ETH0
I0320 15:21:13.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:21:13.423069 543705 net.go:698] Add success.
I0320 15:21:13.463392 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ffa63386-154c-43a0-b3bf-8f3d20b0c8d3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:21:13.463423 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:21:14.454985 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:21:14.455211 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:21:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 15:21:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:21:14.456654 543705 disk_worker.go:494] system disk:vda1
I0320 15:21:14.456688 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:21:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:21:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:21:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:21:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:21:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:21:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:23.409801 543705 memory.go:184] no items to output this cycle
I0320 15:21:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 15:21:24.885669 543705 disk_info.go:125] begin check local disk info of client
I0320 15:21:24.888098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:21:24.888104 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb280 0xc0001fb2c0]
E0320 15:21:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:33.409762 543705 memory.go:184] no items to output this cycle
I0320 15:21:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 15:21:38.528401 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:21:38.528408 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:21:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:43.410750 543705 memory.go:191] Add success.
I0320 15:21:43.409813 543705 cpu.go:282] Add success.
I0320 15:21:43.420230 543705 net.go:770] primary dev: ETH0
I0320 15:21:43.420246 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:21:43.420261 543705 net.go:698] Add success.
I0320 15:21:43.420611 543705 net.go:648] Add success.
I0320 15:21:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:21:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:21:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:21:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:21:53.409810 543705 memory.go:184] no items to output this cycle
I0320 15:21:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:22:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:03.409786 543705 cpu.go:275] no items to output this cycle
I0320 15:22:03.409787 543705 memory.go:184] no items to output this cycle
E0320 15:22:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:13.409798 543705 cpu.go:282] Add success.
I0320 15:22:13.409803 543705 memory.go:191] Add success.
W0320 15:22:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:22:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:22:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:22:13.420076 543705 net.go:648] Add success.
I0320 15:22:13.422970 543705 net.go:770] primary dev: ETH0
I0320 15:22:13.422983 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:22:13.422996 543705 net.go:698] Add success.
W0320 15:22:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:22:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 15:22:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:22:14.456818 543705 disk_worker.go:494] system disk:vda1
I0320 15:22:14.456861 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:22:14.457134 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:22:14.457142 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:22:14.457147 543705 custom_config.go:64] query custom config with name: gpu
E0320 15:22:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:22:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:22:16.457914 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:22:16.457923 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:22:16.457965 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:22:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:22:16.472320 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:22:23.410250 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:23.410265 543705 memory.go:184] no items to output this cycle
I0320 15:22:23.410267 543705 cpu.go:275] no items to output this cycle
I0320 15:22:24.889668 543705 disk_info.go:125] begin check local disk info of client
I0320 15:22:24.892102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:22:24.892107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb440 0xc0001fb480]
E0320 15:22:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:33.409777 543705 memory.go:184] no items to output this cycle
I0320 15:22:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 15:22:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:43.409821 543705 memory.go:191] Add success.
I0320 15:22:43.409825 543705 cpu.go:282] Add success.
I0320 15:22:43.420069 543705 net.go:648] Add success.
I0320 15:22:43.422724 543705 net.go:770] primary dev: ETH0
I0320 15:22:43.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:22:43.422750 543705 net.go:698] Add success.
I0320 15:22:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:22:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:22:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:22:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:22:53.409802 543705 memory.go:184] no items to output this cycle
I0320 15:22:53.409898 543705 cpu.go:275] no items to output this cycle
E0320 15:23:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:03.409778 543705 memory.go:184] no items to output this cycle
I0320 15:23:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 15:23:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:13.409810 543705 memory.go:191] Add success.
I0320 15:23:13.409817 543705 cpu.go:282] Add success.
W0320 15:23:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:23:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:23:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:23:13.420050 543705 net.go:648] Add success.
I0320 15:23:13.422979 543705 net.go:770] primary dev: ETH0
I0320 15:23:13.422992 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:23:13.423003 543705 net.go:698] Add success.
I0320 15:23:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:23:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:23:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 15:23:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:23:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 15:23:14.456645 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:23:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:23:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:23:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:23:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:23:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:23:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:23.409766 543705 memory.go:184] no items to output this cycle
I0320 15:23:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:23:24.893673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:23:24.896114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:23:24.896120 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340c80 0xc000340cc0]
E0320 15:23:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:33.409772 543705 memory.go:184] no items to output this cycle
I0320 15:23:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 15:23:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:43.409794 543705 memory.go:191] Add success.
I0320 15:23:43.409809 543705 cpu.go:282] Add success.
I0320 15:23:43.419897 543705 net.go:648] Add success.
I0320 15:23:43.422685 543705 net.go:770] primary dev: ETH0
I0320 15:23:43.422700 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:23:43.422715 543705 net.go:698] Add success.
I0320 15:23:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:23:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:23:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:23:53.409910 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:23:53.409929 543705 memory.go:184] no items to output this cycle
I0320 15:23:53.409931 543705 cpu.go:275] no items to output this cycle
E0320 15:24:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:03.409780 543705 memory.go:184] no items to output this cycle
I0320 15:24:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 15:24:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:13.409784 543705 memory.go:191] Add success.
W0320 15:24:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:24:13.409818 543705 cpu.go:282] Add success.
W0320 15:24:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:24:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:24:13.420293 543705 net.go:648] Add success.
I0320 15:24:13.422789 543705 net.go:770] primary dev: ETH0
I0320 15:24:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:24:13.422814 543705 net.go:698] Add success.
I0320 15:24:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:24:14.455086 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:24:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 15:24:14.455152 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:24:14.456486 543705 disk_worker.go:494] system disk:vda1
I0320 15:24:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:24:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:24:15.494118 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9508922c-b18d-49cb-bdf5-a00ab90bfeb2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:24:15.494157 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:24:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:24:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:24:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:24:16.472444 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:24:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:23.409783 543705 memory.go:184] no items to output this cycle
I0320 15:24:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 15:24:24.897677 543705 disk_info.go:125] begin check local disk info of client
I0320 15:24:24.900098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:24:24.900104 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abc40 0xc0001abc80]
E0320 15:24:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:33.409804 543705 memory.go:184] no items to output this cycle
I0320 15:24:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 15:24:38.528873 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:24:38.528880 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:24:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:43.410755 543705 memory.go:191] Add success.
I0320 15:24:43.409815 543705 cpu.go:282] Add success.
I0320 15:24:43.420463 543705 net.go:648] Add success.
I0320 15:24:43.423289 543705 net.go:770] primary dev: ETH0
I0320 15:24:43.423302 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:24:43.423315 543705 net.go:698] Add success.
I0320 15:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:24:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:24:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:24:53.409895 543705 cpu.go:275] no items to output this cycle
E0320 15:24:53.409968 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:24:53.409978 543705 memory.go:184] no items to output this cycle
E0320 15:25:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:03.409796 543705 memory.go:184] no items to output this cycle
I0320 15:25:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 15:25:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:13.409804 543705 memory.go:191] Add success.
I0320 15:25:13.409805 543705 cpu.go:282] Add success.
W0320 15:25:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:25:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:25:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:25:13.420136 543705 net.go:648] Add success.
I0320 15:25:13.422704 543705 net.go:770] primary dev: ETH0
I0320 15:25:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:25:13.422743 543705 net.go:698] Add success.
I0320 15:25:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:25:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:25:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 15:25:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:25:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 15:25:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:25:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:25:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:25:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:25:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:25:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:25:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:23.409776 543705 memory.go:184] no items to output this cycle
I0320 15:25:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 15:25:24.901674 543705 disk_info.go:125] begin check local disk info of client
I0320 15:25:24.904092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:25:24.904097 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbcc0 0xc0001fbd00]
E0320 15:25:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:33.409802 543705 memory.go:184] no items to output this cycle
I0320 15:25:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:25:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:43.409831 543705 memory.go:191] Add success.
I0320 15:25:43.409836 543705 cpu.go:282] Add success.
I0320 15:25:43.419983 543705 net.go:648] Add success.
I0320 15:25:43.422858 543705 net.go:770] primary dev: ETH0
I0320 15:25:43.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:25:43.422883 543705 net.go:698] Add success.
I0320 15:25:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:25:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:25:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:25:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:25:53.409796 543705 memory.go:184] no items to output this cycle
I0320 15:25:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:26:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:03.409794 543705 memory.go:184] no items to output this cycle
I0320 15:26:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 15:26:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:13.409811 543705 cpu.go:282] Add success.
I0320 15:26:13.409813 543705 memory.go:191] Add success.
W0320 15:26:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:26:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:26:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:26:13.420249 543705 net.go:648] Add success.
I0320 15:26:13.423361 543705 net.go:770] primary dev: ETH0
I0320 15:26:13.423374 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:26:13.423385 543705 net.go:698] Add success.
I0320 15:26:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:26:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:26:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 15:26:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:26:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 15:26:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:26:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:26:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:26:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:26:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:26:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:26:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:23.409770 543705 memory.go:184] no items to output this cycle
I0320 15:26:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 15:26:24.905673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:26:24.908133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:26:24.908140 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004719c0 0xc000471a00]
E0320 15:26:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:33.409783 543705 memory.go:184] no items to output this cycle
I0320 15:26:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 15:26:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:43.409799 543705 memory.go:191] Add success.
I0320 15:26:43.409800 543705 cpu.go:282] Add success.
I0320 15:26:43.419973 543705 net.go:648] Add success.
I0320 15:26:43.423163 543705 net.go:770] primary dev: ETH0
I0320 15:26:43.423176 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:26:43.423188 543705 net.go:698] Add success.
I0320 15:26:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:26:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:26:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:26:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:26:53.409784 543705 cpu.go:275] no items to output this cycle
I0320 15:26:53.409788 543705 memory.go:184] no items to output this cycle
E0320 15:27:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:03.409806 543705 memory.go:184] no items to output this cycle
I0320 15:27:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:27:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:13.409797 543705 memory.go:191] Add success.
I0320 15:27:13.409804 543705 cpu.go:282] Add success.
W0320 15:27:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:27:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:27:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:27:13.420195 543705 net.go:648] Add success.
I0320 15:27:13.429247 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 15:27:13.429322 543705 net.go:770] primary dev: ETH0
I0320 15:27:13.429334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:27:13.429345 543705 net.go:698] Add success.
I0320 15:27:13.452939 543705 event_worker.go:152] Polling the log file for events...
I0320 15:27:13.463217 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0fcf64cf-706f-46f9-8497-c795fc3bfedc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:27:13.463251 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 15:27:14.455220 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:27:14.455234 543705 disk_worker.go:708] disk space is not compliant
W0320 15:27:14.455238 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:27:14.456068 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:27:14.456078 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:27:14.456084 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:27:14.456928 543705 disk_worker.go:494] system disk:vda1
I0320 15:27:14.456955 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:27:15.456796 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:27:15.456805 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:27:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:27:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:27:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:27:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:27:16.472323 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:27:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:23.409763 543705 memory.go:184] no items to output this cycle
I0320 15:27:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 15:27:24.909671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:27:24.912112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:27:24.912117 543705 disk_info.go:196] parse disk info done, disk is : [0xc000536e80 0xc000536ec0]
E0320 15:27:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:33.409782 543705 memory.go:184] no items to output this cycle
I0320 15:27:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 15:27:38.529025 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:27:38.529033 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:27:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:43.410788 543705 memory.go:191] Add success.
I0320 15:27:43.409808 543705 cpu.go:282] Add success.
I0320 15:27:43.420540 543705 net.go:648] Add success.
I0320 15:27:43.423410 543705 net.go:770] primary dev: ETH0
I0320 15:27:43.423424 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:27:43.423436 543705 net.go:698] Add success.
I0320 15:27:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:27:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:27:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:27:53.409902 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:27:53.409942 543705 cpu.go:275] no items to output this cycle
I0320 15:27:53.409967 543705 memory.go:184] no items to output this cycle
E0320 15:28:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:03.409794 543705 memory.go:184] no items to output this cycle
I0320 15:28:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 15:28:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:13.409786 543705 memory.go:191] Add success.
I0320 15:28:13.409805 543705 cpu.go:282] Add success.
W0320 15:28:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:28:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:28:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:28:13.420233 543705 net.go:648] Add success.
I0320 15:28:13.423079 543705 net.go:770] primary dev: ETH0
I0320 15:28:13.423092 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:28:13.423105 543705 net.go:698] Add success.
I0320 15:28:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:28:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:28:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 15:28:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:28:14.456497 543705 disk_worker.go:494] system disk:vda1
I0320 15:28:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:28:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:28:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:28:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:28:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:28:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:28:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:23.409774 543705 memory.go:184] no items to output this cycle
I0320 15:28:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 15:28:24.913672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:28:24.916103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:28:24.916109 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587100 0xc000587140]
E0320 15:28:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:33.409801 543705 memory.go:184] no items to output this cycle
I0320 15:28:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 15:28:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:43.409804 543705 memory.go:191] Add success.
I0320 15:28:43.409803 543705 cpu.go:282] Add success.
I0320 15:28:43.420061 543705 net.go:648] Add success.
I0320 15:28:43.422635 543705 net.go:770] primary dev: ETH0
I0320 15:28:43.422650 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:28:43.422665 543705 net.go:698] Add success.
I0320 15:28:46.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:28:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:28:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:28:53.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:28:53.409877 543705 memory.go:184] no items to output this cycle
I0320 15:28:53.410022 543705 cpu.go:275] no items to output this cycle
E0320 15:29:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:03.409763 543705 memory.go:184] no items to output this cycle
I0320 15:29:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 15:29:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:13.409794 543705 memory.go:191] Add success.
I0320 15:29:13.409804 543705 cpu.go:282] Add success.
W0320 15:29:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:29:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:29:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:29:13.420248 543705 net.go:648] Add success.
I0320 15:29:13.423195 543705 net.go:770] primary dev: ETH0
I0320 15:29:13.423209 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:29:13.423223 543705 net.go:698] Add success.
I0320 15:29:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:29:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:29:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 15:29:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:29:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 15:29:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:29:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:29:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:29:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:29:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:29:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:29:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:23.409791 543705 memory.go:184] no items to output this cycle
I0320 15:29:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 15:29:24.917675 543705 disk_info.go:125] begin check local disk info of client
I0320 15:29:24.920134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:29:24.920140 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa880 0xc0001aa8c0]
E0320 15:29:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:33.409798 543705 memory.go:184] no items to output this cycle
I0320 15:29:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 15:29:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:43.409798 543705 memory.go:191] Add success.
I0320 15:29:43.409834 543705 cpu.go:282] Add success.
I0320 15:29:43.420061 543705 net.go:648] Add success.
I0320 15:29:43.422811 543705 net.go:770] primary dev: ETH0
I0320 15:29:43.422823 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:29:43.422835 543705 net.go:698] Add success.
I0320 15:29:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:29:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:29:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:29:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:29:53.409811 543705 memory.go:184] no items to output this cycle
I0320 15:29:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:30:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:03.409775 543705 memory.go:184] no items to output this cycle
I0320 15:30:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 15:30:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:13.409787 543705 memory.go:191] Add success.
I0320 15:30:13.409809 543705 cpu.go:282] Add success.
W0320 15:30:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:30:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:30:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:30:13.420282 543705 net.go:648] Add success.
I0320 15:30:13.422966 543705 net.go:770] primary dev: ETH0
I0320 15:30:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:30:13.422990 543705 net.go:698] Add success.
I0320 15:30:13.469138 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ba5a2913-e7a4-4779-8591-4fbd10f6516f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:30:13.469173 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:30:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:30:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:30:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 15:30:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:30:14.456698 543705 disk_worker.go:494] system disk:vda1
I0320 15:30:14.456728 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:30:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:30:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:30:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:30:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:30:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:30:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:23.409797 543705 memory.go:184] no items to output this cycle
I0320 15:30:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 15:30:24.921672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:30:24.924121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:30:24.924127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab100 0xc0001ab140]
E0320 15:30:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:33.409774 543705 memory.go:184] no items to output this cycle
I0320 15:30:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 15:30:38.529180 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:30:38.529187 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:30:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:43.410876 543705 memory.go:191] Add success.
I0320 15:30:43.409823 543705 cpu.go:282] Add success.
I0320 15:30:43.420676 543705 net.go:648] Add success.
I0320 15:30:43.423383 543705 net.go:770] primary dev: ETH0
I0320 15:30:43.423396 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:30:43.423408 543705 net.go:698] Add success.
I0320 15:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:30:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:30:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:30:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:30:53.409794 543705 memory.go:184] no items to output this cycle
I0320 15:30:53.409806 543705 cpu.go:275] no items to output this cycle
I0320 15:31:03.409876 543705 cpu.go:275] no items to output this cycle
E0320 15:31:03.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:03.409893 543705 memory.go:184] no items to output this cycle
E0320 15:31:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:13.409789 543705 memory.go:191] Add success.
W0320 15:31:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:31:13.409817 543705 cpu.go:282] Add success.
W0320 15:31:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:31:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:31:13.420131 543705 net.go:648] Add success.
I0320 15:31:13.422815 543705 net.go:770] primary dev: ETH0
I0320 15:31:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:31:13.422845 543705 net.go:698] Add success.
I0320 15:31:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:31:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:31:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 15:31:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:31:14.456572 543705 disk_worker.go:494] system disk:vda1
I0320 15:31:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:31:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:31:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:31:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:31:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:31:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:31:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:23.409777 543705 memory.go:184] no items to output this cycle
I0320 15:31:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 15:31:24.925672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:31:24.928075 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:31:24.928081 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470940 0xc000470980]
E0320 15:31:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:33.409771 543705 memory.go:184] no items to output this cycle
I0320 15:31:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:31:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:43.409819 543705 memory.go:191] Add success.
I0320 15:31:43.409828 543705 cpu.go:282] Add success.
I0320 15:31:43.419960 543705 net.go:648] Add success.
I0320 15:31:43.422774 543705 net.go:770] primary dev: ETH0
I0320 15:31:43.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:31:43.422802 543705 net.go:698] Add success.
I0320 15:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:31:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:31:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:31:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:31:53.409806 543705 memory.go:184] no items to output this cycle
I0320 15:31:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 15:32:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:03.409808 543705 memory.go:184] no items to output this cycle
I0320 15:32:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 15:32:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:13.409883 543705 memory.go:191] Add success.
W0320 15:32:13.409914 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:32:13.409927 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:32:13.409928 543705 cpu.go:282] Add success.
I0320 15:32:13.409930 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:32:13.419710 543705 net.go:648] Add success.
I0320 15:32:13.422218 543705 net.go:770] primary dev: ETH0
I0320 15:32:13.422243 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:32:13.422254 543705 net.go:698] Add success.
W0320 15:32:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:32:14.455150 543705 disk_worker.go:708] disk space is not compliant
W0320 15:32:14.455153 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:32:14.456911 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:32:14.456920 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:32:14.456926 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:32:14.456994 543705 disk_worker.go:494] system disk:vda1
I0320 15:32:14.457023 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:32:15.456855 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:32:15.456866 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:32:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:32:16.457957 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:32:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:32:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:32:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:32:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:23.409777 543705 memory.go:184] no items to output this cycle
I0320 15:32:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 15:32:24.929670 543705 disk_info.go:125] begin check local disk info of client
I0320 15:32:24.932092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:32:24.932097 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053a440 0xc00053a480]
E0320 15:32:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:33.409793 543705 memory.go:184] no items to output this cycle
I0320 15:32:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 15:32:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:43.409790 543705 memory.go:191] Add success.
I0320 15:32:43.409808 543705 cpu.go:282] Add success.
I0320 15:32:43.419937 543705 net.go:648] Add success.
I0320 15:32:43.422508 543705 net.go:770] primary dev: ETH0
I0320 15:32:43.422520 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:32:43.422532 543705 net.go:698] Add success.
I0320 15:32:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:32:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:32:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:32:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:32:53.409763 543705 memory.go:184] no items to output this cycle
I0320 15:32:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 15:33:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:03.409779 543705 memory.go:184] no items to output this cycle
I0320 15:33:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 15:33:13.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:13.409913 543705 memory.go:191] Add success.
I0320 15:33:13.409933 543705 cpu.go:282] Add success.
W0320 15:33:13.409950 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:33:13.409968 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:33:13.409978 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:33:13.419708 543705 net.go:648] Add success.
I0320 15:33:13.422684 543705 net.go:770] primary dev: ETH0
I0320 15:33:13.422697 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:33:13.422708 543705 net.go:698] Add success.
I0320 15:33:13.469096 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"55d2669b-b5e9-45bd-86ea-c6342fd70c84","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:33:13.469135 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:33:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:33:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:33:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 15:33:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:33:14.456679 543705 disk_worker.go:494] system disk:vda1
I0320 15:33:14.456709 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:33:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:33:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:33:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:33:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:33:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:33:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:23.409776 543705 memory.go:184] no items to output this cycle
I0320 15:33:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 15:33:24.933671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:33:24.936168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:33:24.936175 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aba80 0xc0001abac0]
E0320 15:33:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:33.409788 543705 memory.go:184] no items to output this cycle
I0320 15:33:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:33:38.529358 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:33:38.529365 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:33:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:43.410547 543705 memory.go:191] Add success.
I0320 15:33:43.409805 543705 cpu.go:282] Add success.
I0320 15:33:43.420258 543705 net.go:648] Add success.
I0320 15:33:43.422574 543705 net.go:770] primary dev: ETH0
I0320 15:33:43.422588 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:33:43.422600 543705 net.go:698] Add success.
I0320 15:33:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:33:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:33:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:33:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:33:53.409778 543705 memory.go:184] no items to output this cycle
I0320 15:33:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 15:34:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:03.409800 543705 memory.go:184] no items to output this cycle
I0320 15:34:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 15:34:13.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:13.409892 543705 memory.go:191] Add success.
W0320 15:34:13.409921 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:34:13.409933 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:34:13.409940 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:34:13.410051 543705 cpu.go:282] Add success.
I0320 15:34:13.419705 543705 net.go:648] Add success.
I0320 15:34:13.422690 543705 net.go:770] primary dev: ETH0
I0320 15:34:13.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:34:13.422715 543705 net.go:698] Add success.
I0320 15:34:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:34:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:34:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 15:34:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:34:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 15:34:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:34:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:34:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:34:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:34:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:34:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:23.409795 543705 memory.go:184] no items to output this cycle
I0320 15:34:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 15:34:24.937669 543705 disk_info.go:125] begin check local disk info of client
I0320 15:34:24.940094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:34:24.940100 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b76c0 0xc0002b7700]
E0320 15:34:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:33.409761 543705 memory.go:184] no items to output this cycle
I0320 15:34:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 15:34:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:43.409800 543705 memory.go:191] Add success.
I0320 15:34:43.409805 543705 cpu.go:282] Add success.
I0320 15:34:43.419956 543705 net.go:648] Add success.
I0320 15:34:43.422853 543705 net.go:770] primary dev: ETH0
I0320 15:34:43.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:34:43.422890 543705 net.go:698] Add success.
I0320 15:34:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:34:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:34:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:34:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:34:53.409771 543705 memory.go:184] no items to output this cycle
I0320 15:34:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 15:35:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:03.409788 543705 memory.go:184] no items to output this cycle
I0320 15:35:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 15:35:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:13.409794 543705 memory.go:191] Add success.
I0320 15:35:13.409799 543705 cpu.go:282] Add success.
W0320 15:35:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:35:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:35:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:35:13.420062 543705 net.go:648] Add success.
I0320 15:35:13.422681 543705 net.go:770] primary dev: ETH0
I0320 15:35:13.422694 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:35:13.422707 543705 net.go:698] Add success.
I0320 15:35:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:35:14.455294 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:35:14.455387 543705 disk_worker.go:708] disk space is not compliant
W0320 15:35:14.455391 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:35:14.457042 543705 disk_worker.go:494] system disk:vda1
I0320 15:35:14.457084 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:35:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:35:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:35:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:35:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:35:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:23.409801 543705 memory.go:184] no items to output this cycle
I0320 15:35:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 15:35:24.941668 543705 disk_info.go:125] begin check local disk info of client
I0320 15:35:24.944086 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:35:24.944092 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4200 0xc0000c4240]
E0320 15:35:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:33.409781 543705 memory.go:184] no items to output this cycle
I0320 15:35:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 15:35:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:43.409800 543705 memory.go:191] Add success.
I0320 15:35:43.409801 543705 cpu.go:282] Add success.
I0320 15:35:43.420122 543705 net.go:648] Add success.
I0320 15:35:43.423222 543705 net.go:770] primary dev: ETH0
I0320 15:35:43.423237 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:35:43.423250 543705 net.go:698] Add success.
I0320 15:35:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:35:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:35:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:35:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:35:53.409789 543705 memory.go:184] no items to output this cycle
I0320 15:35:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 15:36:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:03.409800 543705 memory.go:184] no items to output this cycle
I0320 15:36:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 15:36:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:13.409815 543705 memory.go:191] Add success.
I0320 15:36:13.409825 543705 cpu.go:282] Add success.
W0320 15:36:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:36:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:36:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:36:13.420212 543705 net.go:648] Add success.
I0320 15:36:13.423340 543705 net.go:770] primary dev: ETH0
I0320 15:36:13.423352 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:36:13.423366 543705 net.go:698] Add success.
I0320 15:36:13.468524 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a68f5021-5e33-44ae-b9c7-1c58f4c614ea","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:36:13.468573 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:36:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:36:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:36:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 15:36:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:36:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 15:36:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:36:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:36:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:36:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:36:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:36:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:36:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:23.409795 543705 memory.go:184] no items to output this cycle
I0320 15:36:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 15:36:24.945673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:36:24.948206 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:36:24.948213 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa000 0xc0001fa040]
E0320 15:36:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:33.409767 543705 memory.go:184] no items to output this cycle
I0320 15:36:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 15:36:38.530408 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:36:38.530416 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:36:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:43.410616 543705 memory.go:191] Add success.
I0320 15:36:43.409801 543705 cpu.go:282] Add success.
I0320 15:36:43.420320 543705 net.go:648] Add success.
I0320 15:36:43.422894 543705 net.go:770] primary dev: ETH0
I0320 15:36:43.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:36:43.422922 543705 net.go:698] Add success.
I0320 15:36:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:36:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:36:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:36:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:36:53.409796 543705 memory.go:184] no items to output this cycle
I0320 15:36:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 15:37:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:03.409788 543705 memory.go:184] no items to output this cycle
I0320 15:37:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 15:37:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:13.409796 543705 memory.go:191] Add success.
I0320 15:37:13.409796 543705 cpu.go:282] Add success.
W0320 15:37:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:37:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:37:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:37:13.420070 543705 net.go:648] Add success.
I0320 15:37:13.422683 543705 net.go:770] primary dev: ETH0
I0320 15:37:13.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:37:13.422710 543705 net.go:698] Add success.
I0320 15:37:13.453234 543705 event_worker.go:152] Polling the log file for events...
W0320 15:37:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:37:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 15:37:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:37:14.455870 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:37:14.455878 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:37:14.455884 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:37:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 15:37:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:37:15.456890 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:37:15.456900 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 15:37:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:37:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:37:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:37:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:37:16.472331 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:37:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 15:37:23.409805 543705 memory.go:184] no items to output this cycle
I0320 15:37:24.949671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:37:24.952177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:37:24.952184 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a640 0xc00039a680]
E0320 15:37:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:33.409765 543705 memory.go:184] no items to output this cycle
I0320 15:37:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 15:37:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:43.409820 543705 memory.go:191] Add success.
I0320 15:37:43.409834 543705 cpu.go:282] Add success.
I0320 15:37:43.419960 543705 net.go:648] Add success.
I0320 15:37:43.422596 543705 net.go:770] primary dev: ETH0
I0320 15:37:43.422609 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:37:43.422622 543705 net.go:698] Add success.
I0320 15:37:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:37:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:37:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:37:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:37:53.409796 543705 cpu.go:275] no items to output this cycle
I0320 15:37:53.409803 543705 memory.go:184] no items to output this cycle
E0320 15:38:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:03.409809 543705 memory.go:184] no items to output this cycle
I0320 15:38:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:38:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:13.409813 543705 memory.go:191] Add success.
I0320 15:38:13.409820 543705 cpu.go:282] Add success.
W0320 15:38:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:38:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:38:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:38:13.420042 543705 net.go:648] Add success.
I0320 15:38:13.422691 543705 net.go:770] primary dev: ETH0
I0320 15:38:13.422704 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:38:13.422715 543705 net.go:698] Add success.
I0320 15:38:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:38:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:38:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 15:38:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:38:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 15:38:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:38:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:38:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:38:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:38:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:38:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:38:23.410480 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:23.410500 543705 memory.go:184] no items to output this cycle
I0320 15:38:23.410499 543705 cpu.go:275] no items to output this cycle
I0320 15:38:24.953695 543705 disk_info.go:125] begin check local disk info of client
I0320 15:38:24.956128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:38:24.956134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa000 0xc0001fa040]
E0320 15:38:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:33.409786 543705 memory.go:184] no items to output this cycle
I0320 15:38:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:38:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:43.409805 543705 cpu.go:282] Add success.
I0320 15:38:43.409816 543705 memory.go:191] Add success.
I0320 15:38:43.419974 543705 net.go:648] Add success.
I0320 15:38:43.422348 543705 net.go:770] primary dev: ETH0
I0320 15:38:43.422361 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:38:43.422374 543705 net.go:698] Add success.
I0320 15:38:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:38:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:38:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:38:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:38:53.409803 543705 memory.go:184] no items to output this cycle
I0320 15:38:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 15:39:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:03.409807 543705 memory.go:184] no items to output this cycle
I0320 15:39:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 15:39:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:13.409776 543705 memory.go:191] Add success.
I0320 15:39:13.409798 543705 cpu.go:282] Add success.
W0320 15:39:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:39:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:39:13.409817 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:39:13.420237 543705 net.go:648] Add success.
I0320 15:39:13.423137 543705 net.go:770] primary dev: ETH0
I0320 15:39:13.423150 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:39:13.423163 543705 net.go:698] Add success.
I0320 15:39:13.469468 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ec2d6c55-3538-4809-9b5c-7ea05f2f7927","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:39:13.469501 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:39:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:39:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:39:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 15:39:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:39:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 15:39:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:39:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:39:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:39:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:39:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:39:16.472455 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:39:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:23.409765 543705 memory.go:184] no items to output this cycle
I0320 15:39:23.409918 543705 cpu.go:275] no items to output this cycle
I0320 15:39:24.957917 543705 disk_info.go:125] begin check local disk info of client
I0320 15:39:24.960439 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:39:24.960445 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a000 0xc00007a700]
E0320 15:39:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:33.409774 543705 memory.go:184] no items to output this cycle
I0320 15:39:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:39:38.531428 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:39:38.531435 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:39:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:43.410619 543705 memory.go:191] Add success.
I0320 15:39:43.409802 543705 cpu.go:282] Add success.
I0320 15:39:43.420329 543705 net.go:648] Add success.
I0320 15:39:43.422844 543705 net.go:770] primary dev: ETH0
I0320 15:39:43.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:39:43.422871 543705 net.go:698] Add success.
I0320 15:39:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:39:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:39:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:39:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:39:53.409779 543705 memory.go:184] no items to output this cycle
I0320 15:39:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 15:40:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:03.409807 543705 memory.go:184] no items to output this cycle
I0320 15:40:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 15:40:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:13.409786 543705 memory.go:191] Add success.
I0320 15:40:13.409809 543705 cpu.go:282] Add success.
W0320 15:40:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:40:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:40:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:40:13.420128 543705 net.go:648] Add success.
I0320 15:40:13.422787 543705 net.go:770] primary dev: ETH0
I0320 15:40:13.422801 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:40:13.422815 543705 net.go:698] Add success.
I0320 15:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:40:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:40:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 15:40:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:40:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 15:40:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:40:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:40:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:40:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:40:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:40:16.472430 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:40:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:23.409782 543705 memory.go:184] no items to output this cycle
I0320 15:40:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 15:40:24.961670 543705 disk_info.go:125] begin check local disk info of client
I0320 15:40:24.964177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:40:24.964184 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc000 0xc0002bc040]
E0320 15:40:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:33.409780 543705 memory.go:184] no items to output this cycle
I0320 15:40:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 15:40:43.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:43.409837 543705 memory.go:191] Add success.
I0320 15:40:43.409839 543705 cpu.go:282] Add success.
I0320 15:40:43.419797 543705 net.go:770] primary dev: ETH0
I0320 15:40:43.419813 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:40:43.419829 543705 net.go:698] Add success.
I0320 15:40:43.420181 543705 net.go:648] Add success.
I0320 15:40:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:40:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:40:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:40:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:40:53.409777 543705 memory.go:184] no items to output this cycle
I0320 15:40:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 15:41:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:03.409798 543705 memory.go:184] no items to output this cycle
I0320 15:41:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 15:41:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:13.409794 543705 memory.go:191] Add success.
I0320 15:41:13.409799 543705 cpu.go:282] Add success.
W0320 15:41:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:41:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:41:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:41:13.420147 543705 net.go:648] Add success.
I0320 15:41:13.422852 543705 net.go:770] primary dev: ETH0
I0320 15:41:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:41:13.422878 543705 net.go:698] Add success.
I0320 15:41:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:41:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:41:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 15:41:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:41:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 15:41:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:41:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:41:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:41:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:41:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:41:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:41:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:23.409787 543705 memory.go:184] no items to output this cycle
I0320 15:41:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:41:24.965664 543705 disk_info.go:125] begin check local disk info of client
I0320 15:41:24.968108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:41:24.968114 543705 disk_info.go:196] parse disk info done, disk is : [0xc000387d00 0xc000387d40]
E0320 15:41:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:33.409894 543705 memory.go:184] no items to output this cycle
I0320 15:41:33.409964 543705 cpu.go:275] no items to output this cycle
E0320 15:41:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:43.409812 543705 memory.go:191] Add success.
I0320 15:41:43.409848 543705 cpu.go:282] Add success.
I0320 15:41:43.419992 543705 net.go:648] Add success.
I0320 15:41:43.423103 543705 net.go:770] primary dev: ETH0
I0320 15:41:43.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:41:43.423129 543705 net.go:698] Add success.
I0320 15:41:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:41:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:41:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:41:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:41:53.409775 543705 memory.go:184] no items to output this cycle
I0320 15:41:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 15:42:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:03.409819 543705 memory.go:184] no items to output this cycle
I0320 15:42:03.409830 543705 cpu.go:275] no items to output this cycle
E0320 15:42:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:13.409795 543705 memory.go:191] Add success.
I0320 15:42:13.409797 543705 cpu.go:282] Add success.
W0320 15:42:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:42:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:42:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:42:13.420103 543705 net.go:648] Add success.
I0320 15:42:13.422699 543705 net.go:770] primary dev: ETH0
I0320 15:42:13.422714 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:42:13.422731 543705 net.go:698] Add success.
I0320 15:42:13.468777 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2685224a-c7a9-4f3c-9119-c4da11c2df96","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:42:13.468809 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 15:42:14.455220 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:42:14.455238 543705 disk_worker.go:708] disk space is not compliant
W0320 15:42:14.455242 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:42:14.456066 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:42:14.456075 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:42:14.456081 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:42:14.457037 543705 disk_worker.go:494] system disk:vda1
I0320 15:42:14.457067 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:42:15.456800 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:42:15.456808 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:42:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:42:16.457949 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:42:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:42:16.458020 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:42:16.472347 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:42:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:23.409780 543705 memory.go:184] no items to output this cycle
I0320 15:42:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 15:42:24.969672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:42:24.972155 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:42:24.972161 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003de900 0xc0003de940]
E0320 15:42:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:33.409781 543705 memory.go:184] no items to output this cycle
I0320 15:42:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 15:42:38.532431 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:42:38.532438 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:42:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:43.410513 543705 memory.go:191] Add success.
I0320 15:42:43.409805 543705 cpu.go:282] Add success.
I0320 15:42:43.420286 543705 net.go:648] Add success.
I0320 15:42:43.422678 543705 net.go:770] primary dev: ETH0
I0320 15:42:43.422693 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:42:43.422708 543705 net.go:698] Add success.
I0320 15:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:42:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:42:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:42:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:42:53.409806 543705 memory.go:184] no items to output this cycle
I0320 15:42:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 15:43:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:03.409786 543705 memory.go:184] no items to output this cycle
I0320 15:43:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 15:43:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:13.409784 543705 memory.go:191] Add success.
W0320 15:43:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:43:13.409809 543705 cpu.go:282] Add success.
W0320 15:43:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:43:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:43:13.420124 543705 net.go:648] Add success.
I0320 15:43:13.423064 543705 net.go:770] primary dev: ETH0
I0320 15:43:13.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:43:13.423093 543705 net.go:698] Add success.
I0320 15:43:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:43:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:43:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 15:43:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:43:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 15:43:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:43:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:43:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:43:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:43:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:43:16.472434 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:43:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:23.409780 543705 memory.go:184] no items to output this cycle
I0320 15:43:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 15:43:24.973664 543705 disk_info.go:125] begin check local disk info of client
I0320 15:43:24.976197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:43:24.976203 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a180 0xc00036a1c0]
E0320 15:43:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:33.409795 543705 memory.go:184] no items to output this cycle
I0320 15:43:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 15:43:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:43.409792 543705 memory.go:191] Add success.
I0320 15:43:43.409972 543705 cpu.go:282] Add success.
I0320 15:43:43.419741 543705 net.go:648] Add success.
I0320 15:43:43.422776 543705 net.go:770] primary dev: ETH0
I0320 15:43:43.422791 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:43:43.422805 543705 net.go:698] Add success.
I0320 15:43:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:43:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:43:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:43:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:43:53.409795 543705 memory.go:184] no items to output this cycle
I0320 15:43:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 15:44:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:03.409777 543705 memory.go:184] no items to output this cycle
I0320 15:44:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 15:44:13.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:13.409770 543705 memory.go:191] Add success.
W0320 15:44:13.409796 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:44:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:44:13.409810 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:44:13.409819 543705 cpu.go:282] Add success.
I0320 15:44:13.420158 543705 net.go:648] Add success.
I0320 15:44:13.422745 543705 net.go:770] primary dev: ETH0
I0320 15:44:13.422760 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:44:13.422774 543705 net.go:698] Add success.
I0320 15:44:14.454219 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:44:14.454394 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:44:14.454404 543705 disk_worker.go:708] disk space is not compliant
W0320 15:44:14.454407 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:44:14.455747 543705 disk_worker.go:494] system disk:vda1
I0320 15:44:14.455794 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:44:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:44:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:44:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:44:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:44:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:44:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:23.409772 543705 memory.go:184] no items to output this cycle
I0320 15:44:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 15:44:24.977672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:44:24.980112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:44:24.980118 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329640 0xc000329680]
E0320 15:44:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:33.409781 543705 memory.go:184] no items to output this cycle
I0320 15:44:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 15:44:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:43.409789 543705 memory.go:191] Add success.
I0320 15:44:43.409811 543705 cpu.go:282] Add success.
I0320 15:44:43.419955 543705 net.go:648] Add success.
I0320 15:44:43.422609 543705 net.go:770] primary dev: ETH0
I0320 15:44:43.422620 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:44:43.422632 543705 net.go:698] Add success.
I0320 15:44:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:44:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:44:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:44:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:44:53.409784 543705 memory.go:184] no items to output this cycle
I0320 15:44:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 15:45:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:03.409773 543705 memory.go:184] no items to output this cycle
I0320 15:45:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 15:45:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:13.409798 543705 cpu.go:282] Add success.
I0320 15:45:13.409799 543705 memory.go:191] Add success.
W0320 15:45:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:45:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:45:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:45:13.420583 543705 net.go:648] Add success.
I0320 15:45:13.423444 543705 net.go:770] primary dev: ETH0
I0320 15:45:13.423457 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:45:13.423469 543705 net.go:698] Add success.
I0320 15:45:13.470171 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"36febf27-4fdf-434a-9215-c04167b0584f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:45:13.470208 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:45:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:45:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:45:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 15:45:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:45:14.456532 543705 disk_worker.go:494] system disk:vda1
I0320 15:45:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:45:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:45:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:45:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:45:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:45:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:45:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:23.409797 543705 memory.go:184] no items to output this cycle
I0320 15:45:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 15:45:24.981675 543705 disk_info.go:125] begin check local disk info of client
I0320 15:45:24.984123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:45:24.984129 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314100 0xc000314140]
E0320 15:45:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:33.409807 543705 memory.go:184] no items to output this cycle
I0320 15:45:33.409820 543705 cpu.go:275] no items to output this cycle
I0320 15:45:38.533433 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:45:38.533440 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:45:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:43.410679 543705 memory.go:191] Add success.
I0320 15:45:43.409807 543705 cpu.go:282] Add success.
I0320 15:45:43.420399 543705 net.go:648] Add success.
I0320 15:45:43.423206 543705 net.go:770] primary dev: ETH0
I0320 15:45:43.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:45:43.423252 543705 net.go:698] Add success.
I0320 15:45:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:45:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:45:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:45:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:45:53.409787 543705 memory.go:184] no items to output this cycle
I0320 15:45:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 15:46:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:03.409774 543705 memory.go:184] no items to output this cycle
I0320 15:46:03.409788 543705 cpu.go:275] no items to output this cycle
W0320 15:46:13.409708 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:46:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:46:13.409735 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:46:13.409826 543705 cpu.go:282] Add success.
E0320 15:46:13.409829 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:13.409845 543705 memory.go:191] Add success.
I0320 15:46:13.419988 543705 net.go:648] Add success.
I0320 15:46:13.422853 543705 net.go:770] primary dev: ETH0
I0320 15:46:13.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:46:13.422882 543705 net.go:698] Add success.
I0320 15:46:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:46:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:46:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 15:46:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:46:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 15:46:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:46:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:46:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:46:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:46:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:46:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:23.409793 543705 memory.go:184] no items to output this cycle
I0320 15:46:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 15:46:24.985673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:46:24.988132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:46:24.988138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4dc0 0xc0000c4e00]
E0320 15:46:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:33.409798 543705 memory.go:184] no items to output this cycle
I0320 15:46:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 15:46:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:43.409788 543705 memory.go:191] Add success.
I0320 15:46:43.409811 543705 cpu.go:282] Add success.
I0320 15:46:43.419874 543705 net.go:648] Add success.
I0320 15:46:43.422814 543705 net.go:770] primary dev: ETH0
I0320 15:46:43.422828 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:46:43.422841 543705 net.go:698] Add success.
I0320 15:46:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:46:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:46:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:46:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:46:53.409772 543705 memory.go:184] no items to output this cycle
I0320 15:46:53.409877 543705 cpu.go:275] no items to output this cycle
E0320 15:47:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:03.409784 543705 cpu.go:275] no items to output this cycle
I0320 15:47:03.409793 543705 memory.go:184] no items to output this cycle
E0320 15:47:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:13.409788 543705 memory.go:191] Add success.
I0320 15:47:13.409804 543705 cpu.go:282] Add success.
W0320 15:47:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:47:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:47:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:47:13.420464 543705 net.go:648] Add success.
I0320 15:47:13.423251 543705 net.go:770] primary dev: ETH0
I0320 15:47:13.423264 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:47:13.423276 543705 net.go:698] Add success.
I0320 15:47:13.452940 543705 event_worker.go:152] Polling the log file for events...
W0320 15:47:14.455079 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:47:14.455138 543705 disk_worker.go:708] disk space is not compliant
W0320 15:47:14.455141 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:47:14.456885 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:47:14.456893 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:47:14.456900 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:47:14.456972 543705 disk_worker.go:494] system disk:vda1
I0320 15:47:14.457014 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:47:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:47:15.456839 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:47:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:47:16.457912 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:47:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:47:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:47:16.472304 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:47:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:23.409793 543705 memory.go:184] no items to output this cycle
I0320 15:47:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 15:47:24.989671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:47:24.992095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:47:24.992101 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587ac0 0xc000587b00]
E0320 15:47:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:33.409783 543705 memory.go:184] no items to output this cycle
I0320 15:47:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 15:47:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:43.409794 543705 memory.go:191] Add success.
I0320 15:47:43.409812 543705 cpu.go:282] Add success.
I0320 15:47:43.419885 543705 net.go:648] Add success.
I0320 15:47:43.422542 543705 net.go:770] primary dev: ETH0
I0320 15:47:43.422555 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:47:43.422567 543705 net.go:698] Add success.
I0320 15:47:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:47:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:47:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:47:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:47:53.409777 543705 memory.go:184] no items to output this cycle
I0320 15:47:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 15:48:03.409841 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:03.409862 543705 memory.go:184] no items to output this cycle
I0320 15:48:03.409972 543705 cpu.go:275] no items to output this cycle
E0320 15:48:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:13.409786 543705 memory.go:191] Add success.
I0320 15:48:13.409791 543705 cpu.go:282] Add success.
W0320 15:48:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:48:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:48:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:48:13.420088 543705 net.go:648] Add success.
I0320 15:48:13.422691 543705 net.go:770] primary dev: ETH0
I0320 15:48:13.422703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:48:13.422715 543705 net.go:698] Add success.
I0320 15:48:13.967068 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3cfac21e-2f48-421a-85fc-6d8611001870","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:48:13.967102 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:48:14.454634 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:48:14.454865 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:48:14.454875 543705 disk_worker.go:708] disk space is not compliant
W0320 15:48:14.454878 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:48:14.456305 543705 disk_worker.go:494] system disk:vda1
I0320 15:48:14.456351 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:48:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:48:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:48:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:48:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:48:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:48:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:23.409776 543705 memory.go:184] no items to output this cycle
I0320 15:48:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 15:48:24.993668 543705 disk_info.go:125] begin check local disk info of client
I0320 15:48:24.996116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:48:24.996121 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471500 0xc000471540]
E0320 15:48:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:33.409800 543705 memory.go:184] no items to output this cycle
I0320 15:48:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 15:48:38.534441 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:48:38.534449 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:48:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:43.410852 543705 memory.go:191] Add success.
I0320 15:48:43.409801 543705 cpu.go:282] Add success.
I0320 15:48:43.420599 543705 net.go:648] Add success.
I0320 15:48:43.423688 543705 net.go:770] primary dev: ETH0
I0320 15:48:43.423701 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:48:43.423715 543705 net.go:698] Add success.
I0320 15:48:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:48:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:48:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:48:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:48:53.409771 543705 memory.go:184] no items to output this cycle
I0320 15:48:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 15:49:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:03.409778 543705 memory.go:184] no items to output this cycle
I0320 15:49:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 15:49:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:13.409809 543705 memory.go:191] Add success.
I0320 15:49:13.409813 543705 cpu.go:282] Add success.
W0320 15:49:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:49:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:49:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:49:13.420146 543705 net.go:648] Add success.
I0320 15:49:13.422919 543705 net.go:770] primary dev: ETH0
I0320 15:49:13.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:49:13.422943 543705 net.go:698] Add success.
I0320 15:49:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:49:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:49:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 15:49:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:49:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 15:49:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:49:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:49:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:49:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:49:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:49:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:49:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:23.409765 543705 memory.go:184] no items to output this cycle
I0320 15:49:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 15:49:24.997671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:49:25.000105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:49:25.000110 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0320 15:49:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:33.409801 543705 memory.go:184] no items to output this cycle
I0320 15:49:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 15:49:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:43.409788 543705 memory.go:191] Add success.
I0320 15:49:43.409819 543705 cpu.go:282] Add success.
I0320 15:49:43.419988 543705 net.go:648] Add success.
I0320 15:49:43.422588 543705 net.go:770] primary dev: ETH0
I0320 15:49:43.422602 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:49:43.422613 543705 net.go:698] Add success.
I0320 15:49:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:49:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:49:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:49:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:49:53.409811 543705 memory.go:184] no items to output this cycle
I0320 15:49:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 15:50:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:03.409878 543705 cpu.go:275] no items to output this cycle
I0320 15:50:03.409894 543705 memory.go:184] no items to output this cycle
E0320 15:50:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:13.409809 543705 memory.go:191] Add success.
I0320 15:50:13.409823 543705 cpu.go:282] Add success.
W0320 15:50:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:50:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:50:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:50:13.420187 543705 net.go:648] Add success.
I0320 15:50:13.422815 543705 net.go:770] primary dev: ETH0
I0320 15:50:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:50:13.422844 543705 net.go:698] Add success.
I0320 15:50:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:50:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:50:14.455143 543705 disk_worker.go:708] disk space is not compliant
W0320 15:50:14.455146 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:50:14.456488 543705 disk_worker.go:494] system disk:vda1
I0320 15:50:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:50:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:50:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:50:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:50:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:50:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:50:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:23.409777 543705 memory.go:184] no items to output this cycle
I0320 15:50:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 15:50:25.001672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:50:25.004153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:50:25.004159 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005867c0 0xc000586800]
E0320 15:50:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:33.409791 543705 memory.go:184] no items to output this cycle
I0320 15:50:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 15:50:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:43.409787 543705 memory.go:191] Add success.
I0320 15:50:43.409809 543705 cpu.go:282] Add success.
I0320 15:50:43.419869 543705 net.go:648] Add success.
I0320 15:50:43.422591 543705 net.go:770] primary dev: ETH0
I0320 15:50:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:50:43.422617 543705 net.go:698] Add success.
I0320 15:50:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:50:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:50:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:50:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:50:53.409784 543705 memory.go:184] no items to output this cycle
I0320 15:50:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 15:51:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:03.409782 543705 memory.go:184] no items to output this cycle
I0320 15:51:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 15:51:13.409890 543705 cpu.go:282] Add success.
E0320 15:51:13.410031 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:13.410049 543705 memory.go:191] Add success.
W0320 15:51:13.410076 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:51:13.410089 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:51:13.410092 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:51:13.419703 543705 net.go:648] Add success.
I0320 15:51:13.423007 543705 net.go:770] primary dev: ETH0
I0320 15:51:13.423019 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:51:13.423030 543705 net.go:698] Add success.
I0320 15:51:13.472370 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3e30de48-d97b-41c4-a2e4-4b5b867907b5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:51:13.472404 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:51:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:51:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:51:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 15:51:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:51:14.456679 543705 disk_worker.go:494] system disk:vda1
I0320 15:51:14.456707 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:51:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:51:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:51:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:51:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:51:16.472364 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:51:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 15:51:23.409787 543705 memory.go:184] no items to output this cycle
I0320 15:51:25.005671 543705 disk_info.go:125] begin check local disk info of client
I0320 15:51:25.008133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:51:25.008139 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007acc0 0xc00007ad00]
E0320 15:51:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:33.409799 543705 memory.go:184] no items to output this cycle
I0320 15:51:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 15:51:38.535450 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:51:38.535457 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:51:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:43.410715 543705 memory.go:191] Add success.
I0320 15:51:43.409793 543705 cpu.go:282] Add success.
I0320 15:51:43.420413 543705 net.go:648] Add success.
I0320 15:51:43.423201 543705 net.go:770] primary dev: ETH0
I0320 15:51:43.423214 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:51:43.423228 543705 net.go:698] Add success.
I0320 15:51:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:51:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:51:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:51:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:51:53.409791 543705 cpu.go:275] no items to output this cycle
I0320 15:51:53.409793 543705 memory.go:184] no items to output this cycle
E0320 15:52:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:03.409781 543705 memory.go:184] no items to output this cycle
I0320 15:52:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 15:52:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:13.409796 543705 memory.go:191] Add success.
I0320 15:52:13.409796 543705 cpu.go:282] Add success.
W0320 15:52:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:52:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:52:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:52:13.420150 543705 net.go:648] Add success.
I0320 15:52:13.422710 543705 net.go:770] primary dev: ETH0
I0320 15:52:13.422723 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:52:13.422735 543705 net.go:698] Add success.
W0320 15:52:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:52:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 15:52:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0320 15:52:14.456948 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:52:14.456957 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:52:14.456963 543705 custom_config.go:64] query custom config with name: gpu
I0320 15:52:14.457009 543705 disk_worker.go:494] system disk:vda1
I0320 15:52:14.457051 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:52:15.456835 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:52:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:52:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:52:16.457999 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:52:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:52:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:52:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:52:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:23.409778 543705 memory.go:184] no items to output this cycle
I0320 15:52:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 15:52:25.009670 543705 disk_info.go:125] begin check local disk info of client
I0320 15:52:25.012067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:52:25.012072 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053c200 0xc00053c240]
E0320 15:52:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:33.409785 543705 memory.go:184] no items to output this cycle
I0320 15:52:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 15:52:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:43.409794 543705 memory.go:191] Add success.
I0320 15:52:43.409799 543705 cpu.go:282] Add success.
I0320 15:52:43.419783 543705 net.go:770] primary dev: ETH0
I0320 15:52:43.419796 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:52:43.419809 543705 net.go:698] Add success.
I0320 15:52:43.420154 543705 net.go:648] Add success.
I0320 15:52:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:52:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:52:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:52:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:52:53.409771 543705 memory.go:184] no items to output this cycle
I0320 15:52:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 15:53:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:03.409808 543705 memory.go:184] no items to output this cycle
I0320 15:53:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:53:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:13.409784 543705 memory.go:191] Add success.
I0320 15:53:13.409804 543705 cpu.go:282] Add success.
W0320 15:53:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:53:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:53:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:53:13.420166 543705 net.go:648] Add success.
I0320 15:53:13.423317 543705 net.go:770] primary dev: ETH0
I0320 15:53:13.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:53:13.423346 543705 net.go:698] Add success.
I0320 15:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:53:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:53:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 15:53:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:53:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 15:53:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:53:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:53:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:53:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:53:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:53:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:53:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:23.409794 543705 memory.go:184] no items to output this cycle
I0320 15:53:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 15:53:25.013677 543705 disk_info.go:125] begin check local disk info of client
I0320 15:53:25.016097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:53:25.016102 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab1c0 0xc0001ab200]
E0320 15:53:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:33.409799 543705 memory.go:184] no items to output this cycle
I0320 15:53:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 15:53:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:43.409786 543705 memory.go:191] Add success.
I0320 15:53:43.409809 543705 cpu.go:282] Add success.
I0320 15:53:43.419994 543705 net.go:648] Add success.
I0320 15:53:43.422540 543705 net.go:770] primary dev: ETH0
I0320 15:53:43.422553 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:53:43.422566 543705 net.go:698] Add success.
I0320 15:53:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:53:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:53:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:53:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:53:53.409779 543705 memory.go:184] no items to output this cycle
I0320 15:53:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 15:54:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:03.409775 543705 memory.go:184] no items to output this cycle
I0320 15:54:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 15:54:13.409858 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:13.409895 543705 memory.go:191] Add success.
W0320 15:54:13.409957 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 15:54:13.409970 543705 cpu.go:282] Add success.
W0320 15:54:13.409982 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:54:13.409986 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:54:13.419742 543705 net.go:648] Add success.
I0320 15:54:13.422222 543705 net.go:770] primary dev: ETH0
I0320 15:54:13.422235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:54:13.422246 543705 net.go:698] Add success.
I0320 15:54:13.577306 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"519067a3-eb52-43ca-94e1-a19ca95446da","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:54:13.577338 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 15:54:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:54:14.455091 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:54:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 15:54:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:54:14.456486 543705 disk_worker.go:494] system disk:vda1
I0320 15:54:14.456514 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:54:15.455605 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:54:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:54:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:54:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:54:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:54:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:23.409793 543705 memory.go:184] no items to output this cycle
I0320 15:54:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 15:54:25.017685 543705 disk_info.go:125] begin check local disk info of client
I0320 15:54:25.020193 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:54:25.020199 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464e40 0xc000464e80]
E0320 15:54:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 15:54:33.409788 543705 memory.go:184] no items to output this cycle
I0320 15:54:38.536444 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:54:38.536452 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:54:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:43.410669 543705 memory.go:191] Add success.
I0320 15:54:43.409799 543705 cpu.go:282] Add success.
I0320 15:54:43.420542 543705 net.go:648] Add success.
I0320 15:54:43.423172 543705 net.go:770] primary dev: ETH0
I0320 15:54:43.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:54:43.423197 543705 net.go:698] Add success.
I0320 15:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:54:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:54:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:54:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:54:53.409781 543705 memory.go:184] no items to output this cycle
I0320 15:54:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 15:55:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:03.409777 543705 memory.go:184] no items to output this cycle
I0320 15:55:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 15:55:13.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:13.409966 543705 cpu.go:282] Add success.
I0320 15:55:13.410041 543705 memory.go:191] Add success.
W0320 15:55:13.410077 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:55:13.410095 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:55:13.410099 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:55:13.419705 543705 net.go:648] Add success.
I0320 15:55:13.422320 543705 net.go:770] primary dev: ETH0
I0320 15:55:13.422333 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:55:13.422345 543705 net.go:698] Add success.
I0320 15:55:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:55:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:55:14.455148 543705 disk_worker.go:708] disk space is not compliant
W0320 15:55:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:55:14.456478 543705 disk_worker.go:494] system disk:vda1
I0320 15:55:14.456522 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:55:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:55:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:55:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:55:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:55:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:23.409760 543705 memory.go:184] no items to output this cycle
I0320 15:55:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 15:55:25.021672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:55:25.024048 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:55:25.024054 543705 disk_info.go:196] parse disk info done, disk is : [0xc000499bc0 0xc000499c00]
E0320 15:55:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:33.409792 543705 memory.go:184] no items to output this cycle
I0320 15:55:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 15:55:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:43.409792 543705 memory.go:191] Add success.
I0320 15:55:43.409812 543705 cpu.go:282] Add success.
I0320 15:55:43.419946 543705 net.go:648] Add success.
I0320 15:55:43.422562 543705 net.go:770] primary dev: ETH0
I0320 15:55:43.422575 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:55:43.422587 543705 net.go:698] Add success.
I0320 15:55:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:55:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:55:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:55:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:55:53.409810 543705 memory.go:184] no items to output this cycle
I0320 15:55:53.409831 543705 cpu.go:275] no items to output this cycle
E0320 15:56:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:03.409781 543705 memory.go:184] no items to output this cycle
I0320 15:56:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 15:56:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:13.409794 543705 memory.go:191] Add success.
I0320 15:56:13.409794 543705 cpu.go:282] Add success.
W0320 15:56:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:56:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:56:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:56:13.420510 543705 net.go:648] Add success.
I0320 15:56:13.423379 543705 net.go:770] primary dev: ETH0
I0320 15:56:13.423392 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:56:13.423403 543705 net.go:698] Add success.
I0320 15:56:14.453953 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:56:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:56:14.455231 543705 disk_worker.go:708] disk space is not compliant
W0320 15:56:14.455234 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:56:14.456625 543705 disk_worker.go:494] system disk:vda1
I0320 15:56:14.456654 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:56:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:56:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:56:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:56:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:56:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:56:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:23.409777 543705 memory.go:184] no items to output this cycle
I0320 15:56:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 15:56:25.025674 543705 disk_info.go:125] begin check local disk info of client
I0320 15:56:25.028207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:56:25.028213 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5800 0xc0000c5840]
E0320 15:56:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:33.409781 543705 memory.go:184] no items to output this cycle
I0320 15:56:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 15:56:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:43.409791 543705 memory.go:191] Add success.
I0320 15:56:43.409812 543705 cpu.go:282] Add success.
I0320 15:56:43.420017 543705 net.go:648] Add success.
I0320 15:56:43.423099 543705 net.go:770] primary dev: ETH0
I0320 15:56:43.423117 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:56:43.423129 543705 net.go:698] Add success.
I0320 15:56:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:56:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:56:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:56:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:56:53.409768 543705 memory.go:184] no items to output this cycle
I0320 15:56:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 15:57:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:03.409770 543705 memory.go:184] no items to output this cycle
I0320 15:57:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 15:57:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:13.409793 543705 memory.go:191] Add success.
I0320 15:57:13.409796 543705 cpu.go:282] Add success.
W0320 15:57:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:57:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:57:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:57:13.420182 543705 net.go:648] Add success.
I0320 15:57:13.429367 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 15:57:13.429442 543705 net.go:770] primary dev: ETH0
I0320 15:57:13.429453 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:57:13.429464 543705 net.go:698] Add success.
I0320 15:57:13.453016 543705 event_worker.go:152] Polling the log file for events...
I0320 15:57:13.463495 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9bde6078-2547-4b42-85ec-1523f653adc3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 15:57:13.463525 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 15:57:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:57:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 15:57:14.455167 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:57:14.456764 543705 disk_worker.go:494] system disk:vda1
I0320 15:57:14.456801 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 15:57:14.457099 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 15:57:14.457107 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 15:57:14.457111 543705 custom_config.go:64] query custom config with name: gpu
E0320 15:57:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 15:57:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:57:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 15:57:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 15:57:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:57:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:57:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:57:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:23.409778 543705 memory.go:184] no items to output this cycle
I0320 15:57:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 15:57:25.029670 543705 disk_info.go:125] begin check local disk info of client
I0320 15:57:25.032092 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:57:25.032097 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4140 0xc0000c4180]
E0320 15:57:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:33.409800 543705 memory.go:184] no items to output this cycle
I0320 15:57:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 15:57:38.537481 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 15:57:38.537491 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 15:57:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:43.410606 543705 memory.go:191] Add success.
I0320 15:57:43.409805 543705 cpu.go:282] Add success.
I0320 15:57:43.420301 543705 net.go:648] Add success.
I0320 15:57:43.422817 543705 net.go:770] primary dev: ETH0
I0320 15:57:43.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:57:43.422842 543705 net.go:698] Add success.
I0320 15:57:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:57:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:57:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:57:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:57:53.409788 543705 memory.go:184] no items to output this cycle
I0320 15:57:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 15:58:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:03.409783 543705 memory.go:184] no items to output this cycle
I0320 15:58:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 15:58:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:13.409817 543705 memory.go:191] Add success.
I0320 15:58:13.409824 543705 cpu.go:282] Add success.
W0320 15:58:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:58:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:58:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:58:13.420151 543705 net.go:648] Add success.
I0320 15:58:13.422847 543705 net.go:770] primary dev: ETH0
I0320 15:58:13.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:58:13.422872 543705 net.go:698] Add success.
I0320 15:58:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:58:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:58:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0320 15:58:14.455154 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:58:14.456476 543705 disk_worker.go:494] system disk:vda1
I0320 15:58:14.456520 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:58:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:58:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:58:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:58:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:58:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:58:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:23.409798 543705 memory.go:184] no items to output this cycle
I0320 15:58:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 15:58:25.033673 543705 disk_info.go:125] begin check local disk info of client
I0320 15:58:25.036191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:58:25.036198 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487ec0 0xc000487f00]
E0320 15:58:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:33.409776 543705 memory.go:184] no items to output this cycle
I0320 15:58:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 15:58:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:43.409818 543705 memory.go:191] Add success.
I0320 15:58:43.409825 543705 cpu.go:282] Add success.
I0320 15:58:43.419994 543705 net.go:648] Add success.
I0320 15:58:43.423317 543705 net.go:770] primary dev: ETH0
I0320 15:58:43.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:58:43.423347 543705 net.go:698] Add success.
I0320 15:58:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:58:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:58:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:58:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:58:53.409805 543705 memory.go:184] no items to output this cycle
I0320 15:58:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 15:59:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:03.409774 543705 memory.go:184] no items to output this cycle
I0320 15:59:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 15:59:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:13.409817 543705 memory.go:191] Add success.
I0320 15:59:13.409827 543705 cpu.go:282] Add success.
W0320 15:59:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 15:59:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 15:59:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 15:59:13.420145 543705 net.go:648] Add success.
I0320 15:59:13.422664 543705 net.go:770] primary dev: ETH0
I0320 15:59:13.422677 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:59:13.422688 543705 net.go:698] Add success.
I0320 15:59:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 15:59:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 15:59:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 15:59:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 15:59:14.456486 543705 disk_worker.go:494] system disk:vda1
I0320 15:59:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 15:59:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 15:59:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:59:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:59:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 15:59:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 15:59:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:23.409780 543705 memory.go:184] no items to output this cycle
I0320 15:59:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 15:59:25.037672 543705 disk_info.go:125] begin check local disk info of client
I0320 15:59:25.040110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 15:59:25.040115 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4780 0xc0000c47c0]
E0320 15:59:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:33.409794 543705 memory.go:184] no items to output this cycle
I0320 15:59:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 15:59:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:43.409790 543705 memory.go:191] Add success.
I0320 15:59:43.409792 543705 cpu.go:282] Add success.
I0320 15:59:43.419995 543705 net.go:648] Add success.
I0320 15:59:43.423304 543705 net.go:770] primary dev: ETH0
I0320 15:59:43.423318 543705 net.go:802] Send network stats successfully!,count is 6
I0320 15:59:43.423330 543705 net.go:698] Add success.
I0320 15:59:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 15:59:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 15:59:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 15:59:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 15:59:53.409790 543705 cpu.go:275] no items to output this cycle
I0320 15:59:53.409793 543705 memory.go:184] no items to output this cycle
E0320 16:00:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:03.409886 543705 memory.go:184] no items to output this cycle
I0320 16:00:03.409922 543705 cpu.go:275] no items to output this cycle
E0320 16:00:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:13.409777 543705 memory.go:191] Add success.
W0320 16:00:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:00:13.409810 543705 cpu.go:282] Add success.
W0320 16:00:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:00:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:00:13.420221 543705 net.go:648] Add success.
I0320 16:00:13.423182 543705 net.go:770] primary dev: ETH0
I0320 16:00:13.423195 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:00:13.423209 543705 net.go:698] Add success.
I0320 16:00:13.469685 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4fe9cd23-4412-4e26-b897-4de3bbd905d0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:00:13.469719 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:00:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:00:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:00:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 16:00:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:00:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 16:00:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:00:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:00:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:00:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:00:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:00:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:23.409795 543705 memory.go:184] no items to output this cycle
I0320 16:00:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 16:00:25.041671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:00:25.044121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:00:25.044126 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d40 0xc0000c4d80]
E0320 16:00:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:33.409770 543705 memory.go:184] no items to output this cycle
I0320 16:00:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 16:00:38.538452 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:00:38.538459 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:00:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:43.411026 543705 memory.go:191] Add success.
I0320 16:00:43.409829 543705 cpu.go:282] Add success.
I0320 16:00:43.419702 543705 net.go:648] Add success.
I0320 16:00:43.422808 543705 net.go:770] primary dev: ETH0
I0320 16:00:43.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:00:43.422836 543705 net.go:698] Add success.
I0320 16:00:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:00:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:00:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:00:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:00:53.409891 543705 memory.go:184] no items to output this cycle
I0320 16:00:53.409914 543705 cpu.go:275] no items to output this cycle
E0320 16:01:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:03.409777 543705 memory.go:184] no items to output this cycle
I0320 16:01:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 16:01:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:13.409812 543705 memory.go:191] Add success.
I0320 16:01:13.409819 543705 cpu.go:282] Add success.
W0320 16:01:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:01:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:01:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:01:13.420090 543705 net.go:648] Add success.
I0320 16:01:13.422776 543705 net.go:770] primary dev: ETH0
I0320 16:01:13.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:01:13.422801 543705 net.go:698] Add success.
I0320 16:01:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:01:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:01:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 16:01:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:01:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 16:01:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:01:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:01:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:01:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:01:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:01:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:01:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:23.409778 543705 memory.go:184] no items to output this cycle
I0320 16:01:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 16:01:25.045678 543705 disk_info.go:125] begin check local disk info of client
I0320 16:01:25.048094 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:01:25.048099 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471d40 0xc000471d80]
E0320 16:01:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:33.409804 543705 memory.go:184] no items to output this cycle
I0320 16:01:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 16:01:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:43.409804 543705 cpu.go:282] Add success.
I0320 16:01:43.409811 543705 memory.go:191] Add success.
I0320 16:01:43.419895 543705 net.go:648] Add success.
I0320 16:01:43.422492 543705 net.go:770] primary dev: ETH0
I0320 16:01:43.422505 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:01:43.422517 543705 net.go:698] Add success.
I0320 16:01:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:01:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:01:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:01:53.410247 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:01:53.410316 543705 memory.go:184] no items to output this cycle
I0320 16:01:53.410429 543705 cpu.go:275] no items to output this cycle
E0320 16:02:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:03.409792 543705 memory.go:184] no items to output this cycle
I0320 16:02:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 16:02:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:13.409795 543705 memory.go:191] Add success.
I0320 16:02:13.409795 543705 cpu.go:282] Add success.
W0320 16:02:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:02:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:02:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:02:13.420141 543705 net.go:648] Add success.
I0320 16:02:13.422933 543705 net.go:770] primary dev: ETH0
I0320 16:02:13.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:02:13.422958 543705 net.go:698] Add success.
W0320 16:02:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:02:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 16:02:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:02:14.456794 543705 disk_worker.go:494] system disk:vda1
I0320 16:02:14.456833 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:02:14.457085 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:02:14.457093 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:02:14.457098 543705 custom_config.go:64] query custom config with name: gpu
E0320 16:02:15.456802 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:02:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:02:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:02:16.457942 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:02:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:02:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:02:16.472335 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:02:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:23.409809 543705 memory.go:184] no items to output this cycle
I0320 16:02:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 16:02:25.049674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:02:25.052124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:02:25.052130 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5980 0xc0000c59c0]
E0320 16:02:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:33.409803 543705 memory.go:184] no items to output this cycle
I0320 16:02:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 16:02:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:43.409831 543705 memory.go:191] Add success.
I0320 16:02:43.409834 543705 cpu.go:282] Add success.
I0320 16:02:43.419974 543705 net.go:648] Add success.
I0320 16:02:43.423194 543705 net.go:770] primary dev: ETH0
I0320 16:02:43.423206 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:02:43.423219 543705 net.go:698] Add success.
I0320 16:02:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:02:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:02:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:02:53.409865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:02:53.409888 543705 memory.go:184] no items to output this cycle
I0320 16:02:53.409969 543705 cpu.go:275] no items to output this cycle
E0320 16:03:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:03.409768 543705 memory.go:184] no items to output this cycle
I0320 16:03:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 16:03:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:13.409819 543705 memory.go:191] Add success.
I0320 16:03:13.409826 543705 cpu.go:282] Add success.
W0320 16:03:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:03:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:03:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:03:13.420150 543705 net.go:648] Add success.
I0320 16:03:13.422706 543705 net.go:770] primary dev: ETH0
I0320 16:03:13.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:03:13.422730 543705 net.go:698] Add success.
I0320 16:03:13.469007 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4d5ea5c8-01c3-4657-892c-2b621891d0fe","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:03:13.469042 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:03:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:03:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:03:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 16:03:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:03:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 16:03:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:03:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:03:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:03:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:03:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:03:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:03:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:23.409794 543705 memory.go:184] no items to output this cycle
I0320 16:03:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 16:03:25.053671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:03:25.056116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:03:25.056122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005878c0 0xc000587900]
E0320 16:03:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:33.409771 543705 memory.go:184] no items to output this cycle
I0320 16:03:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 16:03:38.539450 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:03:38.539457 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:03:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:43.410902 543705 memory.go:191] Add success.
I0320 16:03:43.409839 543705 cpu.go:282] Add success.
I0320 16:03:43.420583 543705 net.go:648] Add success.
I0320 16:03:43.423366 543705 net.go:770] primary dev: ETH0
I0320 16:03:43.423379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:03:43.423394 543705 net.go:698] Add success.
I0320 16:03:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:03:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:03:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:03:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:03:53.409779 543705 memory.go:184] no items to output this cycle
I0320 16:03:53.409799 543705 cpu.go:275] no items to output this cycle
I0320 16:04:03.409910 543705 cpu.go:275] no items to output this cycle
E0320 16:04:03.409930 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:03.409951 543705 memory.go:184] no items to output this cycle
E0320 16:04:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:13.409785 543705 memory.go:191] Add success.
I0320 16:04:13.409802 543705 cpu.go:282] Add success.
W0320 16:04:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:04:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:04:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:04:13.420183 543705 net.go:648] Add success.
I0320 16:04:13.422873 543705 net.go:770] primary dev: ETH0
I0320 16:04:13.422888 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:04:13.422902 543705 net.go:698] Add success.
I0320 16:04:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:04:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:04:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 16:04:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:04:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 16:04:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:04:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:04:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:04:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:04:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:04:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:23.409786 543705 memory.go:184] no items to output this cycle
I0320 16:04:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 16:04:25.057671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:04:25.060095 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:04:25.060101 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b980 0xc00007b9c0]
E0320 16:04:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:33.409806 543705 memory.go:184] no items to output this cycle
I0320 16:04:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 16:04:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:43.409802 543705 memory.go:191] Add success.
I0320 16:04:43.409803 543705 cpu.go:282] Add success.
I0320 16:04:43.420058 543705 net.go:648] Add success.
I0320 16:04:43.422759 543705 net.go:770] primary dev: ETH0
I0320 16:04:43.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:04:43.422788 543705 net.go:698] Add success.
I0320 16:04:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:04:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:04:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:04:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:04:53.409770 543705 memory.go:184] no items to output this cycle
I0320 16:04:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 16:05:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:03.409771 543705 memory.go:184] no items to output this cycle
I0320 16:05:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 16:05:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:13.409813 543705 memory.go:191] Add success.
I0320 16:05:13.409815 543705 cpu.go:282] Add success.
W0320 16:05:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:05:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:05:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:05:13.419766 543705 net.go:648] Add success.
I0320 16:05:13.422648 543705 net.go:770] primary dev: ETH0
I0320 16:05:13.422660 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:05:13.422672 543705 net.go:698] Add success.
I0320 16:05:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:05:14.455086 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:05:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0320 16:05:14.455148 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:05:14.456467 543705 disk_worker.go:494] system disk:vda1
I0320 16:05:14.456511 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:05:15.456018 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:05:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:05:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:05:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:05:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:05:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:23.409774 543705 memory.go:184] no items to output this cycle
I0320 16:05:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 16:05:25.061670 543705 disk_info.go:125] begin check local disk info of client
I0320 16:05:25.064099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:05:25.064105 543705 disk_info.go:196] parse disk info done, disk is : [0xc000498980 0xc000498a00]
E0320 16:05:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:33.409773 543705 memory.go:184] no items to output this cycle
I0320 16:05:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 16:05:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:43.409821 543705 memory.go:191] Add success.
I0320 16:05:43.409822 543705 cpu.go:282] Add success.
I0320 16:05:43.419825 543705 net.go:770] primary dev: ETH0
I0320 16:05:43.419838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:05:43.419851 543705 net.go:698] Add success.
I0320 16:05:43.420219 543705 net.go:648] Add success.
I0320 16:05:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:05:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:05:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:05:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:05:53.409816 543705 memory.go:184] no items to output this cycle
I0320 16:05:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 16:06:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:03.409800 543705 memory.go:184] no items to output this cycle
I0320 16:06:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 16:06:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:13.409778 543705 memory.go:191] Add success.
W0320 16:06:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:06:13.409808 543705 cpu.go:282] Add success.
W0320 16:06:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:06:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:06:13.419750 543705 net.go:648] Add success.
I0320 16:06:13.422608 543705 net.go:770] primary dev: ETH0
I0320 16:06:13.422622 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:06:13.422633 543705 net.go:698] Add success.
I0320 16:06:13.469675 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"db252f21-e28a-4f08-9969-6b53c6fe0b7d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:06:13.469705 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:06:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:06:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:06:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 16:06:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:06:14.456599 543705 disk_worker.go:494] system disk:vda1
I0320 16:06:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:06:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:06:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:06:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:06:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:06:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:06:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:23.409771 543705 memory.go:184] no items to output this cycle
I0320 16:06:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 16:06:25.065673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:06:25.068100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:06:25.068105 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa400 0xc0001aa480]
E0320 16:06:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:33.409774 543705 memory.go:184] no items to output this cycle
I0320 16:06:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 16:06:38.540457 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:06:38.540464 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:06:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:43.410762 543705 memory.go:191] Add success.
I0320 16:06:43.409802 543705 cpu.go:282] Add success.
I0320 16:06:43.420472 543705 net.go:648] Add success.
I0320 16:06:43.423140 543705 net.go:770] primary dev: ETH0
I0320 16:06:43.423154 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:06:43.423167 543705 net.go:698] Add success.
I0320 16:06:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:06:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:06:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:06:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:06:53.409780 543705 memory.go:184] no items to output this cycle
I0320 16:06:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 16:07:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:03.409784 543705 memory.go:184] no items to output this cycle
I0320 16:07:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:07:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:13.409810 543705 memory.go:191] Add success.
I0320 16:07:13.409816 543705 cpu.go:282] Add success.
W0320 16:07:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:07:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:07:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:07:13.420300 543705 net.go:648] Add success.
I0320 16:07:13.422924 543705 net.go:770] primary dev: ETH0
I0320 16:07:13.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:07:13.422953 543705 net.go:698] Add success.
I0320 16:07:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0320 16:07:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:07:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 16:07:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:07:14.456784 543705 disk_worker.go:494] system disk:vda1
I0320 16:07:14.456821 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:07:14.456990 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:07:14.456999 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:07:14.457004 543705 custom_config.go:64] query custom config with name: gpu
E0320 16:07:15.456681 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:07:15.456690 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:07:16.457939 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:07:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:07:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:07:16.458015 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:07:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:07:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:23.409793 543705 memory.go:184] no items to output this cycle
I0320 16:07:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 16:07:25.069673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:07:25.072056 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:07:25.072062 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d8a40 0xc0003d8a80]
E0320 16:07:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:33.409768 543705 memory.go:184] no items to output this cycle
I0320 16:07:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 16:07:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:43.409803 543705 memory.go:191] Add success.
I0320 16:07:43.409805 543705 cpu.go:282] Add success.
I0320 16:07:43.419974 543705 net.go:648] Add success.
I0320 16:07:43.422587 543705 net.go:770] primary dev: ETH0
I0320 16:07:43.422601 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:07:43.422618 543705 net.go:698] Add success.
I0320 16:07:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:07:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:07:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:07:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:07:53.409788 543705 memory.go:184] no items to output this cycle
I0320 16:07:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 16:08:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:03.409782 543705 memory.go:184] no items to output this cycle
I0320 16:08:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:08:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:13.409792 543705 memory.go:191] Add success.
I0320 16:08:13.409797 543705 cpu.go:282] Add success.
W0320 16:08:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:08:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:08:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:08:13.420068 543705 net.go:648] Add success.
I0320 16:08:13.422954 543705 net.go:770] primary dev: ETH0
I0320 16:08:13.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:08:13.422984 543705 net.go:698] Add success.
I0320 16:08:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:08:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:08:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 16:08:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:08:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 16:08:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:08:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:08:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:08:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:08:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:08:16.472441 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:08:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:23.409781 543705 memory.go:184] no items to output this cycle
I0320 16:08:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 16:08:25.073672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:08:25.076131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:08:25.076137 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003dcd40 0xc0003dcd80]
E0320 16:08:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:33.409786 543705 memory.go:184] no items to output this cycle
I0320 16:08:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 16:08:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:43.409788 543705 memory.go:191] Add success.
I0320 16:08:43.409818 543705 cpu.go:282] Add success.
I0320 16:08:43.419865 543705 net.go:648] Add success.
I0320 16:08:43.422875 543705 net.go:770] primary dev: ETH0
I0320 16:08:43.422888 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:08:43.422901 543705 net.go:698] Add success.
I0320 16:08:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:08:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:08:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:08:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:08:53.409767 543705 memory.go:184] no items to output this cycle
I0320 16:08:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 16:09:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:03.409814 543705 memory.go:184] no items to output this cycle
I0320 16:09:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 16:09:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:13.409812 543705 memory.go:191] Add success.
I0320 16:09:13.409818 543705 cpu.go:282] Add success.
W0320 16:09:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:09:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:09:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:09:13.420098 543705 net.go:648] Add success.
I0320 16:09:13.422567 543705 net.go:770] primary dev: ETH0
I0320 16:09:13.422579 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:09:13.422592 543705 net.go:698] Add success.
I0320 16:09:13.468367 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4ab55eec-13ef-40aa-88d9-afea44b473cf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:09:13.468399 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:09:14.455315 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:09:14.455413 543705 disk_worker.go:708] disk space is not compliant
W0320 16:09:14.455418 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:09:14.457000 543705 disk_worker.go:494] system disk:vda1
I0320 16:09:14.457030 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:09:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:09:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:09:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:09:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:09:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:09:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:23.409796 543705 memory.go:184] no items to output this cycle
I0320 16:09:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 16:09:25.077677 543705 disk_info.go:125] begin check local disk info of client
I0320 16:09:25.080190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:09:25.080196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ac080 0xc0004ac0c0]
E0320 16:09:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:33.409788 543705 memory.go:184] no items to output this cycle
I0320 16:09:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 16:09:38.541450 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:09:38.541457 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:09:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:43.410707 543705 memory.go:191] Add success.
I0320 16:09:43.409824 543705 cpu.go:282] Add success.
I0320 16:09:43.420426 543705 net.go:648] Add success.
I0320 16:09:43.423364 543705 net.go:770] primary dev: ETH0
I0320 16:09:43.423377 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:09:43.423390 543705 net.go:698] Add success.
I0320 16:09:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:09:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:09:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:09:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:09:53.409781 543705 memory.go:184] no items to output this cycle
I0320 16:09:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 16:10:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:03.409794 543705 memory.go:184] no items to output this cycle
I0320 16:10:03.409804 543705 cpu.go:275] no items to output this cycle
W0320 16:10:13.409714 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:10:13.409736 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:10:13.409742 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:10:13.409835 543705 cpu.go:282] Add success.
E0320 16:10:13.409838 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:13.409856 543705 memory.go:191] Add success.
I0320 16:10:13.420048 543705 net.go:648] Add success.
I0320 16:10:13.422595 543705 net.go:770] primary dev: ETH0
I0320 16:10:13.422608 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:10:13.422620 543705 net.go:698] Add success.
I0320 16:10:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:10:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:10:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 16:10:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:10:14.456481 543705 disk_worker.go:494] system disk:vda1
I0320 16:10:14.456509 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:10:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:10:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:10:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:10:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:10:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:10:23.410235 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:23.410251 543705 memory.go:184] no items to output this cycle
I0320 16:10:23.410275 543705 cpu.go:275] no items to output this cycle
I0320 16:10:25.081676 543705 disk_info.go:125] begin check local disk info of client
I0320 16:10:25.084123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:10:25.084129 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315140 0xc000315180]
E0320 16:10:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:33.409779 543705 memory.go:184] no items to output this cycle
I0320 16:10:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:10:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:43.409819 543705 memory.go:191] Add success.
I0320 16:10:43.409828 543705 cpu.go:282] Add success.
I0320 16:10:43.420096 543705 net.go:648] Add success.
I0320 16:10:43.423605 543705 net.go:770] primary dev: ETH0
I0320 16:10:43.423619 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:10:43.423632 543705 net.go:698] Add success.
I0320 16:10:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:10:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:10:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:10:53.410410 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:10:53.410429 543705 memory.go:184] no items to output this cycle
I0320 16:10:53.410439 543705 cpu.go:275] no items to output this cycle
E0320 16:11:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:03.409775 543705 memory.go:184] no items to output this cycle
I0320 16:11:03.409796 543705 cpu.go:275] no items to output this cycle
W0320 16:11:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:11:13.409727 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:11:13.409731 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:11:13.409800 543705 cpu.go:282] Add success.
E0320 16:11:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:13.409831 543705 memory.go:191] Add success.
I0320 16:11:13.420246 543705 net.go:648] Add success.
I0320 16:11:13.422909 543705 net.go:770] primary dev: ETH0
I0320 16:11:13.422924 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:11:13.422936 543705 net.go:698] Add success.
I0320 16:11:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:11:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:11:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 16:11:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:11:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 16:11:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:11:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:11:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:11:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:11:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:11:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:11:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:23.409794 543705 memory.go:184] no items to output this cycle
I0320 16:11:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 16:11:25.085671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:11:25.088114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:11:25.088120 543705 disk_info.go:196] parse disk info done, disk is : [0xc000461b80 0xc000461bc0]
E0320 16:11:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:33.409803 543705 memory.go:184] no items to output this cycle
I0320 16:11:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 16:11:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:43.409782 543705 memory.go:191] Add success.
I0320 16:11:43.409814 543705 cpu.go:282] Add success.
I0320 16:11:43.419885 543705 net.go:648] Add success.
I0320 16:11:43.422554 543705 net.go:770] primary dev: ETH0
I0320 16:11:43.422579 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:11:43.422592 543705 net.go:698] Add success.
I0320 16:11:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:11:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:11:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:11:53.409781 543705 memory.go:184] no items to output this cycle
I0320 16:11:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:12:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:03.409781 543705 memory.go:184] no items to output this cycle
I0320 16:12:03.409786 543705 cpu.go:275] no items to output this cycle
W0320 16:12:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:12:13.409732 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:12:13.409738 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 16:12:13.409830 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:13.409840 543705 cpu.go:282] Add success.
I0320 16:12:13.409850 543705 memory.go:191] Add success.
I0320 16:12:13.419871 543705 net.go:770] primary dev: ETH0
I0320 16:12:13.419883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:12:13.419897 543705 net.go:698] Add success.
I0320 16:12:13.420121 543705 net.go:648] Add success.
I0320 16:12:13.908582 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bfe2a3c7-eaa7-4540-94f5-d4c6afae503b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:12:13.908625 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 16:12:14.454253 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:12:14.454264 543705 disk_worker.go:708] disk space is not compliant
W0320 16:12:14.454266 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:12:14.455610 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:12:14.455619 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:12:14.455635 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:12:14.456238 543705 disk_worker.go:494] system disk:vda1
I0320 16:12:14.456268 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:12:15.457003 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:12:15.457012 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 16:12:16.457928 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:12:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:12:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:12:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:12:16.472348 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:12:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:23.409795 543705 memory.go:184] no items to output this cycle
I0320 16:12:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 16:12:25.089679 543705 disk_info.go:125] begin check local disk info of client
I0320 16:12:25.092214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:12:25.092222 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0320 16:12:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:33.409802 543705 memory.go:184] no items to output this cycle
I0320 16:12:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 16:12:38.542462 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:12:38.542468 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:12:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:43.410721 543705 memory.go:191] Add success.
I0320 16:12:43.409800 543705 cpu.go:282] Add success.
I0320 16:12:43.420502 543705 net.go:648] Add success.
I0320 16:12:43.423175 543705 net.go:770] primary dev: ETH0
I0320 16:12:43.423190 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:12:43.423205 543705 net.go:698] Add success.
I0320 16:12:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:12:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:12:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:12:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:12:53.409797 543705 memory.go:184] no items to output this cycle
I0320 16:12:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 16:13:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:03.409790 543705 memory.go:184] no items to output this cycle
I0320 16:13:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 16:13:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:13.409808 543705 memory.go:191] Add success.
I0320 16:13:13.409818 543705 cpu.go:282] Add success.
W0320 16:13:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:13:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:13:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:13:13.420136 543705 net.go:648] Add success.
I0320 16:13:13.423430 543705 net.go:770] primary dev: ETH0
I0320 16:13:13.423443 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:13:13.423455 543705 net.go:698] Add success.
I0320 16:13:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:13:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:13:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 16:13:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:13:14.456564 543705 disk_worker.go:494] system disk:vda1
I0320 16:13:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:13:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:13:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:13:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:13:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:13:16.472449 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:13:23.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:23.409887 543705 memory.go:184] no items to output this cycle
I0320 16:13:23.409947 543705 cpu.go:275] no items to output this cycle
I0320 16:13:25.093675 543705 disk_info.go:125] begin check local disk info of client
I0320 16:13:25.096110 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:13:25.096117 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa000 0xc0001fa040]
E0320 16:13:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:33.409787 543705 memory.go:184] no items to output this cycle
I0320 16:13:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 16:13:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:43.409810 543705 memory.go:191] Add success.
I0320 16:13:43.409815 543705 cpu.go:282] Add success.
I0320 16:13:43.419913 543705 net.go:648] Add success.
I0320 16:13:43.422714 543705 net.go:770] primary dev: ETH0
I0320 16:13:43.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:13:43.422739 543705 net.go:698] Add success.
I0320 16:13:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:13:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:13:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:13:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:13:53.409794 543705 memory.go:184] no items to output this cycle
I0320 16:13:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 16:14:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:03.409794 543705 memory.go:184] no items to output this cycle
I0320 16:14:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 16:14:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:13.409825 543705 memory.go:191] Add success.
I0320 16:14:13.409828 543705 cpu.go:282] Add success.
W0320 16:14:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:14:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:14:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:14:13.420110 543705 net.go:648] Add success.
I0320 16:14:13.422942 543705 net.go:770] primary dev: ETH0
I0320 16:14:13.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:14:13.422967 543705 net.go:698] Add success.
I0320 16:14:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:14:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:14:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0320 16:14:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:14:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 16:14:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:14:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:14:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:14:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:14:16.472434 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:14:23.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:23.409912 543705 memory.go:184] no items to output this cycle
I0320 16:14:23.409921 543705 cpu.go:275] no items to output this cycle
I0320 16:14:25.097674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:14:25.100207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:14:25.100213 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304000 0xc000304040]
E0320 16:14:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:33.409814 543705 memory.go:184] no items to output this cycle
I0320 16:14:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 16:14:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:43.409830 543705 memory.go:191] Add success.
I0320 16:14:43.409834 543705 cpu.go:282] Add success.
I0320 16:14:43.420020 543705 net.go:648] Add success.
I0320 16:14:43.422604 543705 net.go:770] primary dev: ETH0
I0320 16:14:43.422617 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:14:43.422629 543705 net.go:698] Add success.
I0320 16:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:14:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:14:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:14:53.410232 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:14:53.410247 543705 memory.go:184] no items to output this cycle
I0320 16:14:53.410257 543705 cpu.go:275] no items to output this cycle
E0320 16:15:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:03.409816 543705 memory.go:184] no items to output this cycle
I0320 16:15:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 16:15:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:13.409786 543705 memory.go:191] Add success.
I0320 16:15:13.409788 543705 cpu.go:282] Add success.
W0320 16:15:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:15:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:15:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:15:13.420053 543705 net.go:648] Add success.
I0320 16:15:13.422960 543705 net.go:770] primary dev: ETH0
I0320 16:15:13.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:15:13.422985 543705 net.go:698] Add success.
I0320 16:15:13.470270 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"621884d8-f88c-4ea7-93af-ea78bc1e24b2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:15:13.470305 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:15:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:15:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:15:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 16:15:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:15:14.456673 543705 disk_worker.go:494] system disk:vda1
I0320 16:15:14.456713 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:15:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:15:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:15:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:15:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:15:16.472091 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:15:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:23.409784 543705 memory.go:184] no items to output this cycle
I0320 16:15:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 16:15:25.101672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:15:25.104200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:15:25.104205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b9100 0xc0003b9140]
E0320 16:15:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:33.409784 543705 memory.go:184] no items to output this cycle
I0320 16:15:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 16:15:38.543470 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:15:38.543476 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:15:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:43.410666 543705 memory.go:191] Add success.
I0320 16:15:43.409799 543705 cpu.go:282] Add success.
I0320 16:15:43.420378 543705 net.go:648] Add success.
I0320 16:15:43.423424 543705 net.go:770] primary dev: ETH0
I0320 16:15:43.423437 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:15:43.423450 543705 net.go:698] Add success.
I0320 16:15:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:15:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:15:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:15:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:15:53.409786 543705 memory.go:184] no items to output this cycle
I0320 16:15:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 16:16:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:03.409807 543705 memory.go:184] no items to output this cycle
I0320 16:16:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 16:16:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:13.409780 543705 memory.go:191] Add success.
I0320 16:16:13.409805 543705 cpu.go:282] Add success.
W0320 16:16:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:16:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:16:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:16:13.420191 543705 net.go:648] Add success.
I0320 16:16:13.423288 543705 net.go:770] primary dev: ETH0
I0320 16:16:13.423304 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:16:13.423324 543705 net.go:698] Add success.
I0320 16:16:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:16:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:16:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 16:16:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:16:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 16:16:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:16:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:16:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:16:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:16:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:16:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:16:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:23.409769 543705 memory.go:184] no items to output this cycle
I0320 16:16:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 16:16:25.105675 543705 disk_info.go:125] begin check local disk info of client
I0320 16:16:25.108167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:16:25.108173 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032f980 0xc00032f9c0]
E0320 16:16:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:33.409795 543705 memory.go:184] no items to output this cycle
I0320 16:16:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 16:16:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:43.409792 543705 memory.go:191] Add success.
I0320 16:16:43.409831 543705 cpu.go:282] Add success.
I0320 16:16:43.419884 543705 net.go:648] Add success.
I0320 16:16:43.422803 543705 net.go:770] primary dev: ETH0
I0320 16:16:43.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:16:43.422829 543705 net.go:698] Add success.
I0320 16:16:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:16:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:16:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:16:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:16:53.409763 543705 memory.go:184] no items to output this cycle
I0320 16:16:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 16:17:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:03.409782 543705 memory.go:184] no items to output this cycle
I0320 16:17:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 16:17:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:13.409810 543705 memory.go:191] Add success.
I0320 16:17:13.409820 543705 cpu.go:282] Add success.
W0320 16:17:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:17:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:17:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:17:13.420124 543705 net.go:648] Add success.
I0320 16:17:13.422802 543705 net.go:770] primary dev: ETH0
I0320 16:17:13.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:17:13.422830 543705 net.go:698] Add success.
I0320 16:17:13.453396 543705 event_worker.go:152] Polling the log file for events...
W0320 16:17:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:17:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 16:17:14.455201 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:17:14.456434 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:17:14.456444 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:17:14.456450 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:17:14.456947 543705 disk_worker.go:494] system disk:vda1
I0320 16:17:14.457005 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:17:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:17:15.456792 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 16:17:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:17:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:17:16.458015 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:17:16.458035 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:17:16.472352 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:17:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:23.409803 543705 memory.go:184] no items to output this cycle
I0320 16:17:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 16:17:25.109674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:17:25.112209 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:17:25.112214 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0320 16:17:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:33.409801 543705 memory.go:184] no items to output this cycle
I0320 16:17:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 16:17:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:43.409823 543705 memory.go:191] Add success.
I0320 16:17:43.409829 543705 cpu.go:282] Add success.
I0320 16:17:43.419969 543705 net.go:648] Add success.
I0320 16:17:43.423135 543705 net.go:770] primary dev: ETH0
I0320 16:17:43.423148 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:17:43.423160 543705 net.go:698] Add success.
I0320 16:17:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:17:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:17:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:17:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:17:53.409795 543705 memory.go:184] no items to output this cycle
I0320 16:17:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 16:18:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:03.409801 543705 memory.go:184] no items to output this cycle
I0320 16:18:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 16:18:13.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:13.409774 543705 memory.go:191] Add success.
W0320 16:18:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:18:13.409807 543705 cpu.go:282] Add success.
W0320 16:18:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:18:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:18:13.420041 543705 net.go:648] Add success.
I0320 16:18:13.423094 543705 net.go:770] primary dev: ETH0
I0320 16:18:13.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:18:13.423119 543705 net.go:698] Add success.
I0320 16:18:13.971391 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7b748109-66ad-4ddf-9208-fec9aabe0f47","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:18:13.971427 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:18:14.453974 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:18:14.455279 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:18:14.455289 543705 disk_worker.go:708] disk space is not compliant
W0320 16:18:14.455292 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:18:14.457056 543705 disk_worker.go:494] system disk:vda1
I0320 16:18:14.457085 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:18:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:18:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:18:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:18:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:18:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:18:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:23.409791 543705 memory.go:184] no items to output this cycle
I0320 16:18:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 16:18:25.113674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:18:25.116166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:18:25.116172 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa3c0 0xc0001fa400]
E0320 16:18:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:33.409797 543705 memory.go:184] no items to output this cycle
I0320 16:18:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 16:18:38.544472 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:18:38.544478 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:18:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:43.410847 543705 memory.go:191] Add success.
I0320 16:18:43.409811 543705 cpu.go:282] Add success.
I0320 16:18:43.420565 543705 net.go:648] Add success.
I0320 16:18:43.423360 543705 net.go:770] primary dev: ETH0
I0320 16:18:43.423376 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:18:43.423392 543705 net.go:698] Add success.
I0320 16:18:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:18:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:18:46.458054 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:18:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:18:53.409768 543705 memory.go:184] no items to output this cycle
I0320 16:18:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 16:19:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:03.409809 543705 memory.go:184] no items to output this cycle
I0320 16:19:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 16:19:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:13.409780 543705 memory.go:191] Add success.
W0320 16:19:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:19:13.409811 543705 cpu.go:282] Add success.
W0320 16:19:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:19:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:19:13.420112 543705 net.go:648] Add success.
I0320 16:19:13.422751 543705 net.go:770] primary dev: ETH0
I0320 16:19:13.422766 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:19:13.422780 543705 net.go:698] Add success.
I0320 16:19:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:19:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:19:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 16:19:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:19:14.456528 543705 disk_worker.go:494] system disk:vda1
I0320 16:19:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:19:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:19:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:19:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:19:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:19:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:19:23.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:23.409906 543705 memory.go:184] no items to output this cycle
I0320 16:19:23.409961 543705 cpu.go:275] no items to output this cycle
I0320 16:19:25.117671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:19:25.120273 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:19:25.120279 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032e000 0xc00032e040]
E0320 16:19:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:33.409774 543705 memory.go:184] no items to output this cycle
I0320 16:19:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 16:19:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:43.409798 543705 memory.go:191] Add success.
I0320 16:19:43.409806 543705 cpu.go:282] Add success.
I0320 16:19:43.419962 543705 net.go:648] Add success.
I0320 16:19:43.422676 543705 net.go:770] primary dev: ETH0
I0320 16:19:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:19:43.422702 543705 net.go:698] Add success.
I0320 16:19:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:19:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:19:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:19:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:19:53.409817 543705 memory.go:184] no items to output this cycle
I0320 16:19:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 16:20:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:03.409773 543705 memory.go:184] no items to output this cycle
I0320 16:20:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 16:20:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:13.409779 543705 memory.go:191] Add success.
W0320 16:20:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:20:13.409810 543705 cpu.go:282] Add success.
W0320 16:20:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:20:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:20:13.420192 543705 net.go:648] Add success.
I0320 16:20:13.423263 543705 net.go:770] primary dev: ETH0
I0320 16:20:13.423277 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:20:13.423291 543705 net.go:698] Add success.
I0320 16:20:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:20:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:20:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 16:20:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:20:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 16:20:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:20:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:20:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:20:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:20:16.472425 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:20:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:23.409778 543705 memory.go:184] no items to output this cycle
I0320 16:20:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 16:20:25.121673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:20:25.124192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:20:25.124199 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272000 0xc000272040]
E0320 16:20:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:33.409804 543705 memory.go:184] no items to output this cycle
I0320 16:20:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 16:20:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:43.409819 543705 memory.go:191] Add success.
I0320 16:20:43.409832 543705 cpu.go:282] Add success.
I0320 16:20:43.420004 543705 net.go:648] Add success.
I0320 16:20:43.422670 543705 net.go:770] primary dev: ETH0
I0320 16:20:43.422685 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:20:43.422697 543705 net.go:698] Add success.
I0320 16:20:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:20:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:20:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:20:53.410243 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:20:53.410258 543705 memory.go:184] no items to output this cycle
I0320 16:20:53.410289 543705 cpu.go:275] no items to output this cycle
E0320 16:21:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:03.409810 543705 memory.go:184] no items to output this cycle
I0320 16:21:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 16:21:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:13.409817 543705 memory.go:191] Add success.
I0320 16:21:13.409822 543705 cpu.go:282] Add success.
W0320 16:21:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:21:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:21:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:21:13.420077 543705 net.go:648] Add success.
I0320 16:21:13.422644 543705 net.go:770] primary dev: ETH0
I0320 16:21:13.422659 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:21:13.422671 543705 net.go:698] Add success.
I0320 16:21:13.463413 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"60e92f49-324b-4d27-965f-0d50f0510254","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:21:13.463447 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:21:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:21:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:21:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 16:21:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:21:14.456530 543705 disk_worker.go:494] system disk:vda1
I0320 16:21:14.456576 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:21:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:21:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:21:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:21:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:21:16.472486 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:21:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:23.409768 543705 memory.go:184] no items to output this cycle
I0320 16:21:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 16:21:25.125673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:21:25.128240 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:21:25.128246 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396400 0xc000396440]
E0320 16:21:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:33.409782 543705 memory.go:184] no items to output this cycle
I0320 16:21:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 16:21:38.545489 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:21:38.545495 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:21:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:43.410501 543705 memory.go:191] Add success.
I0320 16:21:43.409832 543705 cpu.go:282] Add success.
I0320 16:21:43.420392 543705 net.go:648] Add success.
I0320 16:21:43.422924 543705 net.go:770] primary dev: ETH0
I0320 16:21:43.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:21:43.422954 543705 net.go:698] Add success.
I0320 16:21:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:21:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:21:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:21:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:21:53.409785 543705 memory.go:184] no items to output this cycle
I0320 16:21:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 16:22:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:03.409796 543705 memory.go:184] no items to output this cycle
I0320 16:22:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 16:22:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:13.409815 543705 memory.go:191] Add success.
I0320 16:22:13.409827 543705 cpu.go:282] Add success.
W0320 16:22:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:22:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:22:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:22:13.420283 543705 net.go:648] Add success.
I0320 16:22:13.422937 543705 net.go:770] primary dev: ETH0
I0320 16:22:13.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:22:13.422965 543705 net.go:698] Add success.
W0320 16:22:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:22:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 16:22:14.455159 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:22:14.456931 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:22:14.456941 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:22:14.456947 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:22:14.456993 543705 disk_worker.go:494] system disk:vda1
I0320 16:22:14.457034 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:22:15.456926 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:22:15.456940 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:22:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:22:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:22:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:22:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:22:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:22:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:23.409806 543705 memory.go:184] no items to output this cycle
I0320 16:22:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 16:22:25.129671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:22:25.132115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:22:25.132121 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032f680 0xc00032f6c0]
E0320 16:22:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:33.409767 543705 memory.go:184] no items to output this cycle
I0320 16:22:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 16:22:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:43.409803 543705 memory.go:191] Add success.
I0320 16:22:43.409805 543705 cpu.go:282] Add success.
I0320 16:22:43.419975 543705 net.go:648] Add success.
I0320 16:22:43.423086 543705 net.go:770] primary dev: ETH0
I0320 16:22:43.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:22:43.423118 543705 net.go:698] Add success.
I0320 16:22:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:22:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:22:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:22:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:22:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 16:22:53.409781 543705 memory.go:184] no items to output this cycle
E0320 16:23:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:03.409805 543705 memory.go:184] no items to output this cycle
I0320 16:23:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 16:23:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:13.409794 543705 memory.go:191] Add success.
I0320 16:23:13.409795 543705 cpu.go:282] Add success.
W0320 16:23:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:23:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:23:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:23:13.420123 543705 net.go:648] Add success.
I0320 16:23:13.423209 543705 net.go:770] primary dev: ETH0
I0320 16:23:13.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:23:13.423240 543705 net.go:698] Add success.
I0320 16:23:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:23:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:23:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 16:23:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:23:14.456627 543705 disk_worker.go:494] system disk:vda1
I0320 16:23:14.456660 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:23:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:23:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:23:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:23:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:23:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:23:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:23.409807 543705 memory.go:184] no items to output this cycle
I0320 16:23:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 16:23:25.133677 543705 disk_info.go:125] begin check local disk info of client
I0320 16:23:25.136123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:23:25.136129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fd540 0xc0001fd580]
E0320 16:23:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:33.409775 543705 memory.go:184] no items to output this cycle
I0320 16:23:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 16:23:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:43.409792 543705 memory.go:191] Add success.
I0320 16:23:43.409810 543705 cpu.go:282] Add success.
I0320 16:23:43.419955 543705 net.go:648] Add success.
I0320 16:23:43.422650 543705 net.go:770] primary dev: ETH0
I0320 16:23:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:23:43.422680 543705 net.go:698] Add success.
I0320 16:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:23:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:23:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:23:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:23:53.409793 543705 memory.go:184] no items to output this cycle
I0320 16:23:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 16:24:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:03.409784 543705 memory.go:184] no items to output this cycle
I0320 16:24:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:24:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:13.409829 543705 memory.go:191] Add success.
I0320 16:24:13.409840 543705 cpu.go:282] Add success.
W0320 16:24:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:24:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:24:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:24:13.420421 543705 net.go:648] Add success.
I0320 16:24:13.423733 543705 net.go:770] primary dev: ETH0
I0320 16:24:13.423751 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:24:13.423769 543705 net.go:698] Add success.
I0320 16:24:13.468249 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4ee83008-9af2-47f7-85c2-a6f8adc50191","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:24:13.468284 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:24:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:24:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:24:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 16:24:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:24:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 16:24:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:24:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:24:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:24:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:24:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:24:16.472424 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:24:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:23.409783 543705 memory.go:184] no items to output this cycle
I0320 16:24:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 16:24:25.137674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:24:25.140100 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:24:25.140106 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470040 0xc000470080]
E0320 16:24:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:33.409787 543705 memory.go:184] no items to output this cycle
I0320 16:24:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 16:24:38.546494 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:24:38.546501 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:24:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:43.410835 543705 memory.go:191] Add success.
I0320 16:24:43.409816 543705 cpu.go:282] Add success.
I0320 16:24:43.420513 543705 net.go:648] Add success.
I0320 16:24:43.423339 543705 net.go:770] primary dev: ETH0
I0320 16:24:43.423354 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:24:43.423368 543705 net.go:698] Add success.
I0320 16:24:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:24:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:24:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:24:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:24:53.409786 543705 memory.go:184] no items to output this cycle
I0320 16:24:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:25:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:03.409806 543705 memory.go:184] no items to output this cycle
I0320 16:25:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 16:25:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:13.409832 543705 memory.go:191] Add success.
I0320 16:25:13.409839 543705 cpu.go:282] Add success.
W0320 16:25:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:25:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:25:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:25:13.420311 543705 net.go:648] Add success.
I0320 16:25:13.423290 543705 net.go:770] primary dev: ETH0
I0320 16:25:13.423306 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:25:13.423321 543705 net.go:698] Add success.
I0320 16:25:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:25:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:25:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 16:25:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:25:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 16:25:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:25:15.455947 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:25:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:25:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:25:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:25:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:25:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:23.409787 543705 memory.go:184] no items to output this cycle
I0320 16:25:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 16:25:25.141675 543705 disk_info.go:125] begin check local disk info of client
I0320 16:25:25.144112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:25:25.144118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000dc000 0xc0000dc040]
E0320 16:25:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:33.409777 543705 memory.go:184] no items to output this cycle
I0320 16:25:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 16:25:43.409808 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:43.409844 543705 memory.go:191] Add success.
I0320 16:25:43.409844 543705 cpu.go:282] Add success.
I0320 16:25:43.420057 543705 net.go:648] Add success.
I0320 16:25:43.422819 543705 net.go:770] primary dev: ETH0
I0320 16:25:43.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:25:43.422846 543705 net.go:698] Add success.
I0320 16:25:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:25:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:25:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:25:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:25:53.409793 543705 memory.go:184] no items to output this cycle
I0320 16:25:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 16:26:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:03.409812 543705 memory.go:184] no items to output this cycle
I0320 16:26:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 16:26:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:13.409827 543705 memory.go:191] Add success.
I0320 16:26:13.409836 543705 cpu.go:282] Add success.
W0320 16:26:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:26:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:26:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:26:13.420149 543705 net.go:648] Add success.
I0320 16:26:13.422768 543705 net.go:770] primary dev: ETH0
I0320 16:26:13.422794 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:26:13.422808 543705 net.go:698] Add success.
I0320 16:26:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:26:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:26:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 16:26:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:26:14.456817 543705 disk_worker.go:494] system disk:vda1
I0320 16:26:14.456846 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:26:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:26:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:26:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:26:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:26:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:26:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:23.409799 543705 memory.go:184] no items to output this cycle
I0320 16:26:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 16:26:25.145674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:26:25.148132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:26:25.148138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004924c0 0xc000492500]
E0320 16:26:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:33.409776 543705 memory.go:184] no items to output this cycle
I0320 16:26:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 16:26:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:43.409829 543705 memory.go:191] Add success.
I0320 16:26:43.409832 543705 cpu.go:282] Add success.
I0320 16:26:43.419884 543705 net.go:648] Add success.
I0320 16:26:43.422898 543705 net.go:770] primary dev: ETH0
I0320 16:26:43.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:26:43.422924 543705 net.go:698] Add success.
I0320 16:26:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:26:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:26:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:26:53.409776 543705 memory.go:184] no items to output this cycle
I0320 16:26:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 16:27:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:03.409780 543705 memory.go:184] no items to output this cycle
I0320 16:27:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:27:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:13.409778 543705 memory.go:191] Add success.
W0320 16:27:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:27:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:27:13.409815 543705 cpu.go:282] Add success.
I0320 16:27:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:27:13.426076 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 16:27:13.426154 543705 net.go:770] primary dev: ETH0
I0320 16:27:13.426173 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:27:13.426189 543705 net.go:698] Add success.
I0320 16:27:13.426549 543705 net.go:648] Add success.
I0320 16:27:13.453055 543705 event_worker.go:152] Polling the log file for events...
I0320 16:27:13.477286 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8b5347e2-a208-4504-a081-8a0f39c3ea75","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:27:13.477317 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 16:27:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:27:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 16:27:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:27:14.455921 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:27:14.455931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:27:14.455937 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:27:14.456684 543705 disk_worker.go:494] system disk:vda1
I0320 16:27:14.456715 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:27:15.456862 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:27:15.456872 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:27:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:27:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:27:16.457979 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:27:16.457999 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:27:16.472320 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:27:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:23.409782 543705 memory.go:184] no items to output this cycle
I0320 16:27:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 16:27:25.149669 543705 disk_info.go:125] begin check local disk info of client
I0320 16:27:25.152142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:27:25.152148 543705 disk_info.go:196] parse disk info done, disk is : [0xc000322080 0xc0003220c0]
E0320 16:27:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:33.409802 543705 memory.go:184] no items to output this cycle
I0320 16:27:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 16:27:38.547493 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:27:38.547499 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:27:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:43.410674 543705 memory.go:191] Add success.
I0320 16:27:43.409820 543705 cpu.go:282] Add success.
I0320 16:27:43.420373 543705 net.go:648] Add success.
I0320 16:27:43.423202 543705 net.go:770] primary dev: ETH0
I0320 16:27:43.423216 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:27:43.423231 543705 net.go:698] Add success.
I0320 16:27:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:27:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:27:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:27:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:27:53.409808 543705 memory.go:184] no items to output this cycle
I0320 16:27:53.409824 543705 cpu.go:275] no items to output this cycle
E0320 16:28:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:03.409781 543705 memory.go:184] no items to output this cycle
I0320 16:28:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 16:28:13.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:13.409936 543705 memory.go:191] Add success.
I0320 16:28:13.409978 543705 cpu.go:282] Add success.
W0320 16:28:13.410058 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:28:13.410090 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:28:13.410095 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:28:13.419725 543705 net.go:648] Add success.
I0320 16:28:13.422619 543705 net.go:770] primary dev: ETH0
I0320 16:28:13.422632 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:28:13.422644 543705 net.go:698] Add success.
I0320 16:28:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:28:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:28:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 16:28:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:28:14.456646 543705 disk_worker.go:494] system disk:vda1
I0320 16:28:14.456678 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:28:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:28:16.458040 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:28:16.458101 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:28:16.458124 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:28:16.472466 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:28:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:23.409774 543705 memory.go:184] no items to output this cycle
I0320 16:28:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 16:28:25.153673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:28:25.156118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:28:25.156124 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037d740 0xc00037d780]
E0320 16:28:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:33.409805 543705 memory.go:184] no items to output this cycle
I0320 16:28:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 16:28:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:43.409795 543705 memory.go:191] Add success.
I0320 16:28:43.409815 543705 cpu.go:282] Add success.
I0320 16:28:43.419882 543705 net.go:648] Add success.
I0320 16:28:43.422649 543705 net.go:770] primary dev: ETH0
I0320 16:28:43.422663 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:28:43.422676 543705 net.go:698] Add success.
I0320 16:28:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:28:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:28:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:28:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:28:53.409771 543705 memory.go:184] no items to output this cycle
I0320 16:28:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 16:29:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:03.409814 543705 memory.go:184] no items to output this cycle
I0320 16:29:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 16:29:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:13.409873 543705 memory.go:191] Add success.
W0320 16:29:13.409905 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:29:13.409918 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:29:13.409922 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:29:13.409951 543705 cpu.go:282] Add success.
I0320 16:29:13.419741 543705 net.go:648] Add success.
I0320 16:29:13.422685 543705 net.go:770] primary dev: ETH0
I0320 16:29:13.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:29:13.422713 543705 net.go:698] Add success.
I0320 16:29:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:29:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:29:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 16:29:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:29:14.456526 543705 disk_worker.go:494] system disk:vda1
I0320 16:29:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:29:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:29:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:29:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:29:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:29:16.472429 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:29:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:23.409795 543705 memory.go:184] no items to output this cycle
I0320 16:29:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 16:29:25.157675 543705 disk_info.go:125] begin check local disk info of client
I0320 16:29:25.160241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:29:25.160247 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4a00 0xc0000c4a40]
E0320 16:29:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:33.409811 543705 memory.go:184] no items to output this cycle
I0320 16:29:33.409825 543705 cpu.go:275] no items to output this cycle
E0320 16:29:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:43.409784 543705 memory.go:191] Add success.
I0320 16:29:43.409827 543705 cpu.go:282] Add success.
I0320 16:29:43.420024 543705 net.go:648] Add success.
I0320 16:29:43.423355 543705 net.go:770] primary dev: ETH0
I0320 16:29:43.423368 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:29:43.423383 543705 net.go:698] Add success.
I0320 16:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:29:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:29:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:29:53.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:29:53.409817 543705 memory.go:184] no items to output this cycle
I0320 16:29:53.409837 543705 cpu.go:275] no items to output this cycle
E0320 16:30:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:03.409781 543705 memory.go:184] no items to output this cycle
I0320 16:30:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 16:30:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:13.409826 543705 memory.go:191] Add success.
I0320 16:30:13.409840 543705 cpu.go:282] Add success.
W0320 16:30:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:30:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:30:13.409893 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:30:13.420258 543705 net.go:648] Add success.
I0320 16:30:13.422859 543705 net.go:770] primary dev: ETH0
I0320 16:30:13.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:30:13.422885 543705 net.go:698] Add success.
I0320 16:30:13.468908 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7d7f05ab-0772-4a9e-9cad-bf2c243d8a6e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:30:13.468942 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:30:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:30:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:30:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 16:30:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:30:14.456499 543705 disk_worker.go:494] system disk:vda1
I0320 16:30:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:30:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:30:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:30:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:30:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:30:16.472416 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:30:23.410576 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:23.410583 543705 cpu.go:275] no items to output this cycle
I0320 16:30:23.410592 543705 memory.go:184] no items to output this cycle
I0320 16:30:25.161676 543705 disk_info.go:125] begin check local disk info of client
I0320 16:30:25.164135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:30:25.164143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e9b00 0xc0001fa000]
E0320 16:30:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:33.409781 543705 memory.go:184] no items to output this cycle
I0320 16:30:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 16:30:38.548488 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:30:38.548495 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:30:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:43.410717 543705 memory.go:191] Add success.
I0320 16:30:43.409830 543705 cpu.go:282] Add success.
I0320 16:30:43.420502 543705 net.go:648] Add success.
I0320 16:30:43.423236 543705 net.go:770] primary dev: ETH0
I0320 16:30:43.423249 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:30:43.423261 543705 net.go:698] Add success.
I0320 16:30:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:30:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:30:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:30:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:30:53.409769 543705 memory.go:184] no items to output this cycle
I0320 16:30:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 16:31:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:03.409782 543705 memory.go:184] no items to output this cycle
I0320 16:31:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 16:31:13.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:13.410016 543705 cpu.go:282] Add success.
I0320 16:31:13.410078 543705 memory.go:191] Add success.
W0320 16:31:13.410114 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:31:13.410143 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:31:13.410148 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:31:13.419731 543705 net.go:648] Add success.
I0320 16:31:13.423067 543705 net.go:770] primary dev: ETH0
I0320 16:31:13.423082 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:31:13.423096 543705 net.go:698] Add success.
I0320 16:31:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:31:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:31:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 16:31:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:31:14.456560 543705 disk_worker.go:494] system disk:vda1
I0320 16:31:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:31:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:31:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:31:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:31:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:31:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:31:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:23.409763 543705 memory.go:184] no items to output this cycle
I0320 16:31:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 16:31:25.165674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:31:25.168119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:31:25.168125 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a77c0 0xc0004a7800]
I0320 16:31:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 16:31:33.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:33.409813 543705 memory.go:184] no items to output this cycle
E0320 16:31:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:43.409784 543705 memory.go:191] Add success.
I0320 16:31:43.409813 543705 cpu.go:282] Add success.
I0320 16:31:43.419999 543705 net.go:648] Add success.
I0320 16:31:43.422581 543705 net.go:770] primary dev: ETH0
I0320 16:31:43.422595 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:31:43.422609 543705 net.go:698] Add success.
I0320 16:31:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:31:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:31:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:31:53.410370 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:31:53.410389 543705 memory.go:184] no items to output this cycle
I0320 16:31:53.410407 543705 cpu.go:275] no items to output this cycle
E0320 16:32:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:03.409769 543705 memory.go:184] no items to output this cycle
I0320 16:32:03.409792 543705 cpu.go:275] no items to output this cycle
W0320 16:32:13.409715 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:32:13.409736 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:32:13.409743 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:32:13.409832 543705 cpu.go:282] Add success.
E0320 16:32:13.409838 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:13.409856 543705 memory.go:191] Add success.
I0320 16:32:13.420263 543705 net.go:648] Add success.
I0320 16:32:13.423065 543705 net.go:770] primary dev: ETH0
I0320 16:32:13.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:32:13.423090 543705 net.go:698] Add success.
W0320 16:32:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:32:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 16:32:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:32:14.456802 543705 disk_worker.go:494] system disk:vda1
I0320 16:32:14.456838 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:32:14.457004 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:32:14.457028 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:32:14.457033 543705 custom_config.go:64] query custom config with name: gpu
E0320 16:32:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:32:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:32:16.457961 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:32:16.457961 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:32:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:32:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:32:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:32:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:23.409792 543705 memory.go:184] no items to output this cycle
I0320 16:32:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 16:32:25.169671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:32:25.172182 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:32:25.172189 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003694c0 0xc000369500]
E0320 16:32:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:33.409773 543705 memory.go:184] no items to output this cycle
I0320 16:32:33.409775 543705 cpu.go:275] no items to output this cycle
E0320 16:32:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:43.409819 543705 memory.go:191] Add success.
I0320 16:32:43.409829 543705 cpu.go:282] Add success.
I0320 16:32:43.419976 543705 net.go:648] Add success.
I0320 16:32:43.422931 543705 net.go:770] primary dev: ETH0
I0320 16:32:43.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:32:43.422956 543705 net.go:698] Add success.
I0320 16:32:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:32:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:32:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:32:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:32:53.409778 543705 memory.go:184] no items to output this cycle
I0320 16:32:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 16:33:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:03.409778 543705 memory.go:184] no items to output this cycle
I0320 16:33:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 16:33:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:13.409792 543705 memory.go:191] Add success.
I0320 16:33:13.409810 543705 cpu.go:282] Add success.
W0320 16:33:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:33:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:33:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:33:13.420288 543705 net.go:648] Add success.
I0320 16:33:13.422796 543705 net.go:770] primary dev: ETH0
I0320 16:33:13.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:33:13.422821 543705 net.go:698] Add success.
I0320 16:33:13.468155 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"846895ce-5c25-488e-9f16-2cb1b19b4a90","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:33:13.468187 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:33:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:33:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:33:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 16:33:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:33:14.456511 543705 disk_worker.go:494] system disk:vda1
I0320 16:33:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:33:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:33:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:33:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:33:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:33:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:33:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:23.409777 543705 memory.go:184] no items to output this cycle
I0320 16:33:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 16:33:25.173674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:33:25.176130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:33:25.176136 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae1c0 0xc0004ae200]
E0320 16:33:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 16:33:33.409790 543705 memory.go:184] no items to output this cycle
I0320 16:33:38.549496 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:33:38.549502 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:33:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:43.410698 543705 memory.go:191] Add success.
I0320 16:33:43.409829 543705 cpu.go:282] Add success.
I0320 16:33:43.420434 543705 net.go:648] Add success.
I0320 16:33:43.423261 543705 net.go:770] primary dev: ETH0
I0320 16:33:43.423274 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:33:43.423288 543705 net.go:698] Add success.
I0320 16:33:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:33:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:33:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:33:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:33:53.409803 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:33:53.409822 543705 memory.go:184] no items to output this cycle
E0320 16:34:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:03.409783 543705 memory.go:184] no items to output this cycle
I0320 16:34:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 16:34:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:13.409793 543705 memory.go:191] Add success.
I0320 16:34:13.409796 543705 cpu.go:282] Add success.
W0320 16:34:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:34:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:34:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:34:13.420223 543705 net.go:648] Add success.
I0320 16:34:13.423304 543705 net.go:770] primary dev: ETH0
I0320 16:34:13.423321 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:34:13.423334 543705 net.go:698] Add success.
I0320 16:34:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:34:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:34:14.455135 543705 disk_worker.go:708] disk space is not compliant
W0320 16:34:14.455138 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:34:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 16:34:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:34:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:34:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:34:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:34:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:34:16.472425 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:34:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:23.409778 543705 memory.go:184] no items to output this cycle
I0320 16:34:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 16:34:25.177670 543705 disk_info.go:125] begin check local disk info of client
I0320 16:34:25.180194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:34:25.180200 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035f940 0xc00035f980]
E0320 16:34:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:33.409773 543705 memory.go:184] no items to output this cycle
I0320 16:34:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 16:34:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:43.409785 543705 memory.go:191] Add success.
I0320 16:34:43.409817 543705 cpu.go:282] Add success.
I0320 16:34:43.419856 543705 net.go:648] Add success.
I0320 16:34:43.422964 543705 net.go:770] primary dev: ETH0
I0320 16:34:43.422993 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:34:43.423008 543705 net.go:698] Add success.
I0320 16:34:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:34:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:34:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:34:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:34:53.409774 543705 memory.go:184] no items to output this cycle
I0320 16:34:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 16:35:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:03.409783 543705 memory.go:184] no items to output this cycle
I0320 16:35:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:35:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:13.409782 543705 memory.go:191] Add success.
I0320 16:35:13.409799 543705 cpu.go:282] Add success.
W0320 16:35:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:35:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:35:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:35:13.420340 543705 net.go:648] Add success.
I0320 16:35:13.423058 543705 net.go:770] primary dev: ETH0
I0320 16:35:13.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:35:13.423082 543705 net.go:698] Add success.
I0320 16:35:14.454949 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:35:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:35:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 16:35:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:35:14.456499 543705 disk_worker.go:494] system disk:vda1
I0320 16:35:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:35:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:35:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:35:16.458022 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:35:16.458045 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:35:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:35:23.410363 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:23.410379 543705 memory.go:184] no items to output this cycle
I0320 16:35:23.410413 543705 cpu.go:275] no items to output this cycle
I0320 16:35:25.181676 543705 disk_info.go:125] begin check local disk info of client
I0320 16:35:25.184160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:35:25.184167 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047b700 0xc00047b740]
E0320 16:35:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:33.409797 543705 memory.go:184] no items to output this cycle
I0320 16:35:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 16:35:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:43.409796 543705 memory.go:191] Add success.
I0320 16:35:43.409797 543705 cpu.go:282] Add success.
I0320 16:35:43.419872 543705 net.go:648] Add success.
I0320 16:35:43.422594 543705 net.go:770] primary dev: ETH0
I0320 16:35:43.422607 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:35:43.422619 543705 net.go:698] Add success.
I0320 16:35:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:35:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:35:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:35:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:35:53.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:35:53.409821 543705 memory.go:184] no items to output this cycle
E0320 16:36:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:03.409785 543705 memory.go:184] no items to output this cycle
I0320 16:36:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 16:36:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:13.409784 543705 memory.go:191] Add success.
W0320 16:36:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:36:13.409817 543705 cpu.go:282] Add success.
W0320 16:36:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:36:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:36:13.420368 543705 net.go:648] Add success.
I0320 16:36:13.423452 543705 net.go:770] primary dev: ETH0
I0320 16:36:13.423466 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:36:13.423477 543705 net.go:698] Add success.
I0320 16:36:13.468938 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f03743fd-53d9-49b4-b151-054000a11ec2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:36:13.468969 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:36:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:36:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:36:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 16:36:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:36:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 16:36:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:36:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:36:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:36:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:36:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:36:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:36:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:23.409766 543705 memory.go:184] no items to output this cycle
I0320 16:36:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 16:36:25.185671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:36:25.188108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:36:25.188114 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036b040 0xc00036b080]
E0320 16:36:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:33.409767 543705 memory.go:184] no items to output this cycle
I0320 16:36:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 16:36:38.550506 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:36:38.550513 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:36:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:43.410620 543705 memory.go:191] Add success.
I0320 16:36:43.409828 543705 cpu.go:282] Add success.
I0320 16:36:43.420384 543705 net.go:648] Add success.
I0320 16:36:43.422828 543705 net.go:770] primary dev: ETH0
I0320 16:36:43.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:36:43.422859 543705 net.go:698] Add success.
I0320 16:36:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:36:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:36:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:36:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:36:53.409784 543705 memory.go:184] no items to output this cycle
I0320 16:36:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:37:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:03.409779 543705 memory.go:184] no items to output this cycle
I0320 16:37:03.409811 543705 cpu.go:275] no items to output this cycle
W0320 16:37:13.409707 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:37:13.409722 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:37:13.409727 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 16:37:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:13.409813 543705 cpu.go:282] Add success.
I0320 16:37:13.409819 543705 memory.go:191] Add success.
I0320 16:37:13.420154 543705 net.go:648] Add success.
I0320 16:37:13.422940 543705 net.go:770] primary dev: ETH0
I0320 16:37:13.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:37:13.422965 543705 net.go:698] Add success.
I0320 16:37:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0320 16:37:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:37:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0320 16:37:14.455157 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:37:14.456929 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:37:14.456938 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:37:14.456945 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:37:14.456994 543705 disk_worker.go:494] system disk:vda1
I0320 16:37:14.457024 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:37:15.456808 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:37:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:37:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:37:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:37:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:37:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:37:16.472328 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:37:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:23.409790 543705 memory.go:184] no items to output this cycle
I0320 16:37:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 16:37:25.189673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:37:25.192142 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:37:25.192149 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032af80 0xc00032afc0]
E0320 16:37:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 16:37:33.409790 543705 memory.go:184] no items to output this cycle
E0320 16:37:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:43.409817 543705 memory.go:191] Add success.
I0320 16:37:43.409829 543705 cpu.go:282] Add success.
I0320 16:37:43.420066 543705 net.go:648] Add success.
I0320 16:37:43.422836 543705 net.go:770] primary dev: ETH0
I0320 16:37:43.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:37:43.422862 543705 net.go:698] Add success.
I0320 16:37:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:37:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:37:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:37:53.410399 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:37:53.410416 543705 memory.go:184] no items to output this cycle
I0320 16:37:53.410472 543705 cpu.go:275] no items to output this cycle
E0320 16:38:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:03.409783 543705 memory.go:184] no items to output this cycle
I0320 16:38:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 16:38:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:13.409799 543705 memory.go:191] Add success.
I0320 16:38:13.409800 543705 cpu.go:282] Add success.
W0320 16:38:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:38:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:38:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:38:13.420304 543705 net.go:648] Add success.
I0320 16:38:13.423064 543705 net.go:770] primary dev: ETH0
I0320 16:38:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:38:13.423089 543705 net.go:698] Add success.
I0320 16:38:14.454948 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:38:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:38:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 16:38:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:38:14.456532 543705 disk_worker.go:494] system disk:vda1
I0320 16:38:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:38:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:38:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:38:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:38:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:38:16.472437 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:38:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:23.409766 543705 memory.go:184] no items to output this cycle
I0320 16:38:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 16:38:25.193681 543705 disk_info.go:125] begin check local disk info of client
I0320 16:38:25.196132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:38:25.196139 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e9dc0 0xc0003e9e00]
E0320 16:38:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:33.409801 543705 memory.go:184] no items to output this cycle
I0320 16:38:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 16:38:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:43.409827 543705 memory.go:191] Add success.
I0320 16:38:43.409832 543705 cpu.go:282] Add success.
I0320 16:38:43.419992 543705 net.go:648] Add success.
I0320 16:38:43.422809 543705 net.go:770] primary dev: ETH0
I0320 16:38:43.422822 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:38:43.422835 543705 net.go:698] Add success.
I0320 16:38:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:38:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:38:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:38:53.410253 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:38:53.410278 543705 memory.go:184] no items to output this cycle
I0320 16:38:53.410290 543705 cpu.go:275] no items to output this cycle
E0320 16:39:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:03.409779 543705 memory.go:184] no items to output this cycle
I0320 16:39:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:39:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:13.409801 543705 memory.go:191] Add success.
I0320 16:39:13.409820 543705 cpu.go:282] Add success.
W0320 16:39:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:39:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:39:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:39:13.420172 543705 net.go:648] Add success.
I0320 16:39:13.422971 543705 net.go:770] primary dev: ETH0
I0320 16:39:13.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:39:13.422996 543705 net.go:698] Add success.
I0320 16:39:13.463868 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e03d522e-c5ec-45a8-b0c9-0443e74fdb3e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:39:13.463902 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:39:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:39:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:39:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 16:39:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:39:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 16:39:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:39:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:39:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:39:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:39:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:39:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:23.409794 543705 memory.go:184] no items to output this cycle
I0320 16:39:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 16:39:25.197672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:39:25.200145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:39:25.200151 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470340 0xc000470380]
E0320 16:39:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:33.409775 543705 memory.go:184] no items to output this cycle
I0320 16:39:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 16:39:38.551506 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:39:38.551513 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:39:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:43.410731 543705 memory.go:191] Add success.
I0320 16:39:43.409835 543705 cpu.go:282] Add success.
I0320 16:39:43.420482 543705 net.go:648] Add success.
I0320 16:39:43.423306 543705 net.go:770] primary dev: ETH0
I0320 16:39:43.423320 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:39:43.423332 543705 net.go:698] Add success.
I0320 16:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:39:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:39:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:39:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:39:53.409873 543705 cpu.go:275] no items to output this cycle
I0320 16:39:53.409888 543705 memory.go:184] no items to output this cycle
E0320 16:40:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:03.409771 543705 memory.go:184] no items to output this cycle
I0320 16:40:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 16:40:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:13.409803 543705 memory.go:191] Add success.
I0320 16:40:13.409805 543705 cpu.go:282] Add success.
W0320 16:40:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:40:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:40:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:40:13.420062 543705 net.go:648] Add success.
I0320 16:40:13.422891 543705 net.go:770] primary dev: ETH0
I0320 16:40:13.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:40:13.422916 543705 net.go:698] Add success.
I0320 16:40:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:40:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:40:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 16:40:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:40:14.456525 543705 disk_worker.go:494] system disk:vda1
I0320 16:40:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:40:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:40:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:40:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:40:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:40:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:40:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:23.409781 543705 memory.go:184] no items to output this cycle
I0320 16:40:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 16:40:25.201684 543705 disk_info.go:125] begin check local disk info of client
I0320 16:40:25.204135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:40:25.204140 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471c00 0xc000471c40]
E0320 16:40:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:33.409775 543705 memory.go:184] no items to output this cycle
I0320 16:40:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 16:40:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:43.409827 543705 memory.go:191] Add success.
I0320 16:40:43.409841 543705 cpu.go:282] Add success.
I0320 16:40:43.419960 543705 net.go:648] Add success.
I0320 16:40:43.423045 543705 net.go:770] primary dev: ETH0
I0320 16:40:43.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:40:43.423232 543705 net.go:698] Add success.
I0320 16:40:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:40:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:40:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:40:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:40:53.409812 543705 memory.go:184] no items to output this cycle
I0320 16:40:53.409828 543705 cpu.go:275] no items to output this cycle
E0320 16:41:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:03.409788 543705 memory.go:184] no items to output this cycle
I0320 16:41:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 16:41:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:13.409818 543705 memory.go:191] Add success.
I0320 16:41:13.409818 543705 cpu.go:282] Add success.
W0320 16:41:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:41:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:41:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:41:13.420151 543705 net.go:648] Add success.
I0320 16:41:13.423010 543705 net.go:770] primary dev: ETH0
I0320 16:41:13.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:41:13.423035 543705 net.go:698] Add success.
I0320 16:41:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:41:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:41:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 16:41:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:41:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 16:41:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:41:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:41:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:41:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:41:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:41:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 16:41:23.409795 543705 memory.go:184] no items to output this cycle
I0320 16:41:25.205672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:41:25.208129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:41:25.208135 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb600 0xc0001fb640]
E0320 16:41:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 16:41:33.409795 543705 memory.go:184] no items to output this cycle
E0320 16:41:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:43.409808 543705 memory.go:191] Add success.
I0320 16:41:43.409808 543705 cpu.go:282] Add success.
I0320 16:41:43.419969 543705 net.go:648] Add success.
I0320 16:41:43.422462 543705 net.go:770] primary dev: ETH0
I0320 16:41:43.422474 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:41:43.422649 543705 net.go:698] Add success.
I0320 16:41:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:41:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:41:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:41:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:41:53.409782 543705 memory.go:184] no items to output this cycle
I0320 16:41:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 16:42:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:03.409813 543705 memory.go:184] no items to output this cycle
I0320 16:42:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 16:42:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:13.409780 543705 memory.go:191] Add success.
W0320 16:42:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:42:13.409809 543705 cpu.go:282] Add success.
W0320 16:42:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:42:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:42:13.420159 543705 net.go:648] Add success.
I0320 16:42:13.422753 543705 net.go:770] primary dev: ETH0
I0320 16:42:13.422765 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:42:13.422778 543705 net.go:698] Add success.
I0320 16:42:13.463559 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8d13ef04-0f33-48cf-88f9-27ab103e34e9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:42:13.463591 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 16:42:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:42:14.455236 543705 disk_worker.go:708] disk space is not compliant
W0320 16:42:14.455241 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:42:14.456116 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:42:14.456126 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:42:14.456131 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:42:14.456981 543705 disk_worker.go:494] system disk:vda1
I0320 16:42:14.457009 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:42:15.456867 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:42:15.456876 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:42:16.457949 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:42:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:42:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:42:16.458017 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:42:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:42:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:23.409774 543705 memory.go:184] no items to output this cycle
I0320 16:42:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 16:42:25.209668 543705 disk_info.go:125] begin check local disk info of client
I0320 16:42:25.212063 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:42:25.212069 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa500 0xc0001fa540]
E0320 16:42:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:33.409795 543705 memory.go:184] no items to output this cycle
I0320 16:42:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 16:42:38.552513 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:42:38.552519 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:42:43.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:43.410887 543705 memory.go:191] Add success.
I0320 16:42:43.409933 543705 cpu.go:282] Add success.
I0320 16:42:43.419725 543705 net.go:648] Add success.
I0320 16:42:43.422202 543705 net.go:770] primary dev: ETH0
I0320 16:42:43.422215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:42:43.422228 543705 net.go:698] Add success.
I0320 16:42:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:42:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:42:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:42:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:42:53.409778 543705 memory.go:184] no items to output this cycle
I0320 16:42:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 16:43:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:03.409782 543705 memory.go:184] no items to output this cycle
I0320 16:43:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:43:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:13.409792 543705 memory.go:191] Add success.
I0320 16:43:13.409798 543705 cpu.go:282] Add success.
W0320 16:43:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:43:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:43:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:43:13.420230 543705 net.go:648] Add success.
I0320 16:43:13.422920 543705 net.go:770] primary dev: ETH0
I0320 16:43:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:43:13.422951 543705 net.go:698] Add success.
I0320 16:43:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:43:14.455215 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:43:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0320 16:43:14.455229 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:43:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 16:43:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:43:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:43:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:43:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:43:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:43:16.472437 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:43:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:23.409801 543705 memory.go:184] no items to output this cycle
I0320 16:43:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 16:43:25.213672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:43:25.216168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:43:25.216174 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002044c0 0xc000204500]
E0320 16:43:33.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:33.409880 543705 cpu.go:275] no items to output this cycle
I0320 16:43:33.409912 543705 memory.go:184] no items to output this cycle
E0320 16:43:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:43.409825 543705 memory.go:191] Add success.
I0320 16:43:43.409832 543705 cpu.go:282] Add success.
I0320 16:43:43.420027 543705 net.go:648] Add success.
I0320 16:43:43.422661 543705 net.go:770] primary dev: ETH0
I0320 16:43:43.422676 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:43:43.422690 543705 net.go:698] Add success.
I0320 16:43:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:43:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:43:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:43:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:43:53.409799 543705 memory.go:184] no items to output this cycle
I0320 16:43:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 16:44:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:03.409784 543705 memory.go:184] no items to output this cycle
I0320 16:44:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 16:44:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:13.409791 543705 cpu.go:282] Add success.
I0320 16:44:13.409793 543705 memory.go:191] Add success.
W0320 16:44:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:44:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:44:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:44:13.420099 543705 net.go:648] Add success.
I0320 16:44:13.422878 543705 net.go:770] primary dev: ETH0
I0320 16:44:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:44:13.422902 543705 net.go:698] Add success.
I0320 16:44:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:44:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:44:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 16:44:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:44:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 16:44:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:44:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:44:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:44:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:44:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:44:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:44:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 16:44:23.409784 543705 memory.go:184] no items to output this cycle
I0320 16:44:25.217672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:44:25.220122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:44:25.220128 543705 disk_info.go:196] parse disk info done, disk is : [0xc000391580 0xc0003915c0]
E0320 16:44:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:33.409799 543705 memory.go:184] no items to output this cycle
I0320 16:44:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 16:44:43.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:43.409917 543705 memory.go:191] Add success.
I0320 16:44:43.410014 543705 cpu.go:282] Add success.
I0320 16:44:43.419731 543705 net.go:648] Add success.
I0320 16:44:43.422835 543705 net.go:770] primary dev: ETH0
I0320 16:44:43.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:44:43.422859 543705 net.go:698] Add success.
I0320 16:44:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:44:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:44:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:44:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:44:53.409779 543705 memory.go:184] no items to output this cycle
I0320 16:44:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 16:45:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:03.409788 543705 memory.go:184] no items to output this cycle
I0320 16:45:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 16:45:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:13.409784 543705 memory.go:191] Add success.
I0320 16:45:13.409789 543705 cpu.go:282] Add success.
W0320 16:45:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:45:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:45:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:45:13.420122 543705 net.go:648] Add success.
I0320 16:45:13.422794 543705 net.go:770] primary dev: ETH0
I0320 16:45:13.422807 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:45:13.422819 543705 net.go:698] Add success.
I0320 16:45:13.469663 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9b469a6c-d59a-4c5d-a281-4b6078ee9b60","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:45:13.469695 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:45:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:45:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:45:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 16:45:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:45:14.456525 543705 disk_worker.go:494] system disk:vda1
I0320 16:45:14.456578 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:45:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:45:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:45:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:45:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:45:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:45:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:23.409765 543705 memory.go:184] no items to output this cycle
I0320 16:45:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 16:45:25.221675 543705 disk_info.go:125] begin check local disk info of client
I0320 16:45:25.224119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:45:25.224125 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278040 0xc000278080]
E0320 16:45:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:33.409809 543705 memory.go:184] no items to output this cycle
I0320 16:45:33.409822 543705 cpu.go:275] no items to output this cycle
I0320 16:45:38.553525 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:45:38.553532 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:45:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:43.410610 543705 memory.go:191] Add success.
I0320 16:45:43.409799 543705 cpu.go:282] Add success.
I0320 16:45:43.420318 543705 net.go:648] Add success.
I0320 16:45:43.423083 543705 net.go:770] primary dev: ETH0
I0320 16:45:43.423099 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:45:43.423115 543705 net.go:698] Add success.
I0320 16:45:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:45:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:45:46.458154 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:45:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:45:53.409785 543705 memory.go:184] no items to output this cycle
I0320 16:45:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 16:46:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:03.409818 543705 memory.go:184] no items to output this cycle
I0320 16:46:03.409834 543705 cpu.go:275] no items to output this cycle
E0320 16:46:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:13.409820 543705 memory.go:191] Add success.
I0320 16:46:13.409830 543705 cpu.go:282] Add success.
W0320 16:46:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:46:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:46:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:46:13.420203 543705 net.go:648] Add success.
I0320 16:46:13.422930 543705 net.go:770] primary dev: ETH0
I0320 16:46:13.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:46:13.422955 543705 net.go:698] Add success.
I0320 16:46:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:46:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:46:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 16:46:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:46:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 16:46:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:46:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:46:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:46:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:46:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:46:16.472371 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:46:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:23.409773 543705 memory.go:184] no items to output this cycle
I0320 16:46:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 16:46:25.225673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:46:25.228106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:46:25.228112 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d40 0xc0000c4d80]
E0320 16:46:33.409810 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:33.409830 543705 memory.go:184] no items to output this cycle
I0320 16:46:33.409843 543705 cpu.go:275] no items to output this cycle
E0320 16:46:43.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:43.409838 543705 memory.go:191] Add success.
I0320 16:46:43.409839 543705 cpu.go:282] Add success.
I0320 16:46:43.420024 543705 net.go:648] Add success.
I0320 16:46:43.422701 543705 net.go:770] primary dev: ETH0
I0320 16:46:43.422716 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:46:43.422732 543705 net.go:698] Add success.
I0320 16:46:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:46:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:46:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:46:53.409898 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:46:53.409916 543705 memory.go:184] no items to output this cycle
I0320 16:46:53.409968 543705 cpu.go:275] no items to output this cycle
E0320 16:47:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:03.409790 543705 memory.go:184] no items to output this cycle
I0320 16:47:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 16:47:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:13.409818 543705 memory.go:191] Add success.
I0320 16:47:13.409832 543705 cpu.go:282] Add success.
W0320 16:47:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:47:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:47:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:47:13.420173 543705 net.go:648] Add success.
I0320 16:47:13.423318 543705 net.go:770] primary dev: ETH0
I0320 16:47:13.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:47:13.423345 543705 net.go:698] Add success.
I0320 16:47:13.452770 543705 event_worker.go:152] Polling the log file for events...
W0320 16:47:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:47:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 16:47:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:47:14.456909 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:47:14.456918 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:47:14.456924 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:47:14.456972 543705 disk_worker.go:494] system disk:vda1
I0320 16:47:14.457011 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:47:15.456818 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:47:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:47:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:47:16.457919 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:47:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:47:16.457991 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:47:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:47:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:23.409762 543705 memory.go:184] no items to output this cycle
I0320 16:47:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 16:47:25.229671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:47:25.232124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:47:25.232130 543705 disk_info.go:196] parse disk info done, disk is : [0xc000586b40 0xc000586b80]
E0320 16:47:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:33.409795 543705 memory.go:184] no items to output this cycle
I0320 16:47:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 16:47:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:43.409820 543705 memory.go:191] Add success.
I0320 16:47:43.409837 543705 cpu.go:282] Add success.
I0320 16:47:43.419896 543705 net.go:648] Add success.
I0320 16:47:43.422513 543705 net.go:770] primary dev: ETH0
I0320 16:47:43.422527 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:47:43.422542 543705 net.go:698] Add success.
I0320 16:47:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:47:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:47:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:47:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:47:53.409767 543705 memory.go:184] no items to output this cycle
I0320 16:47:53.409895 543705 cpu.go:275] no items to output this cycle
E0320 16:48:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:03.409787 543705 memory.go:184] no items to output this cycle
I0320 16:48:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 16:48:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:13.409784 543705 memory.go:191] Add success.
I0320 16:48:13.409795 543705 cpu.go:282] Add success.
W0320 16:48:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:48:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:48:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:48:13.420288 543705 net.go:648] Add success.
I0320 16:48:13.423312 543705 net.go:770] primary dev: ETH0
I0320 16:48:13.423325 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:48:13.423337 543705 net.go:698] Add success.
I0320 16:48:13.732066 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"253a2be5-43aa-44f6-8306-b194508758ac","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:48:13.732108 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:48:14.454736 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:48:14.454952 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:48:14.454963 543705 disk_worker.go:708] disk space is not compliant
W0320 16:48:14.454965 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:48:14.456385 543705 disk_worker.go:494] system disk:vda1
I0320 16:48:14.456416 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:48:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:48:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:48:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:48:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:48:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:48:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:23.409767 543705 memory.go:184] no items to output this cycle
I0320 16:48:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 16:48:25.233671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:48:25.236102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:48:25.236107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003124c0 0xc000312500]
E0320 16:48:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:33.409797 543705 memory.go:184] no items to output this cycle
I0320 16:48:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 16:48:38.554509 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:48:38.554516 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:48:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:43.410683 543705 memory.go:191] Add success.
I0320 16:48:43.409803 543705 cpu.go:282] Add success.
I0320 16:48:43.420373 543705 net.go:648] Add success.
I0320 16:48:43.423206 543705 net.go:770] primary dev: ETH0
I0320 16:48:43.423219 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:48:43.423231 543705 net.go:698] Add success.
I0320 16:48:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:48:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:48:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:48:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:48:53.409772 543705 memory.go:184] no items to output this cycle
I0320 16:48:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 16:49:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:03.409784 543705 memory.go:184] no items to output this cycle
I0320 16:49:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 16:49:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:13.409812 543705 memory.go:191] Add success.
I0320 16:49:13.409817 543705 cpu.go:282] Add success.
W0320 16:49:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:49:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:49:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:49:13.420138 543705 net.go:648] Add success.
I0320 16:49:13.422998 543705 net.go:770] primary dev: ETH0
I0320 16:49:13.423011 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:49:13.423022 543705 net.go:698] Add success.
I0320 16:49:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:49:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:49:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 16:49:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:49:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 16:49:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:49:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:49:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:49:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:49:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:49:16.472479 543705 disk_local_worker.go:436] Get disk info: []
I0320 16:49:23.409774 543705 cpu.go:275] no items to output this cycle
E0320 16:49:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:23.409789 543705 memory.go:184] no items to output this cycle
I0320 16:49:25.237676 543705 disk_info.go:125] begin check local disk info of client
I0320 16:49:25.240087 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:49:25.240093 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaa80 0xc0001aaac0]
E0320 16:49:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:33.409779 543705 memory.go:184] no items to output this cycle
I0320 16:49:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 16:49:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:43.409817 543705 memory.go:191] Add success.
I0320 16:49:43.409827 543705 cpu.go:282] Add success.
I0320 16:49:43.419987 543705 net.go:648] Add success.
I0320 16:49:43.422709 543705 net.go:770] primary dev: ETH0
I0320 16:49:43.422724 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:49:43.422740 543705 net.go:698] Add success.
I0320 16:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:49:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:49:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:49:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:49:53.409776 543705 memory.go:184] no items to output this cycle
I0320 16:49:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 16:50:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:03.409899 543705 cpu.go:275] no items to output this cycle
I0320 16:50:03.409908 543705 memory.go:184] no items to output this cycle
E0320 16:50:13.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:13.409768 543705 memory.go:191] Add success.
W0320 16:50:13.409793 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:50:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:50:13.409808 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:50:13.409814 543705 cpu.go:282] Add success.
I0320 16:50:13.420058 543705 net.go:648] Add success.
I0320 16:50:13.423330 543705 net.go:770] primary dev: ETH0
I0320 16:50:13.423343 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:50:13.423355 543705 net.go:698] Add success.
I0320 16:50:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:50:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:50:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 16:50:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:50:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 16:50:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:50:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:50:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:50:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:50:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:50:16.472419 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:50:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:23.409783 543705 memory.go:184] no items to output this cycle
I0320 16:50:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 16:50:25.241676 543705 disk_info.go:125] begin check local disk info of client
I0320 16:50:25.244123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:50:25.244130 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bdc0 0xc00007be00]
E0320 16:50:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:33.409797 543705 memory.go:184] no items to output this cycle
I0320 16:50:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 16:50:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:43.409800 543705 cpu.go:282] Add success.
I0320 16:50:43.409802 543705 memory.go:191] Add success.
I0320 16:50:43.419959 543705 net.go:648] Add success.
I0320 16:50:43.422754 543705 net.go:770] primary dev: ETH0
I0320 16:50:43.422766 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:50:43.422779 543705 net.go:698] Add success.
I0320 16:50:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:50:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:50:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:50:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:50:53.409803 543705 memory.go:184] no items to output this cycle
I0320 16:50:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 16:51:03.409883 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:03.409890 543705 cpu.go:275] no items to output this cycle
I0320 16:51:03.409902 543705 memory.go:184] no items to output this cycle
E0320 16:51:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:13.409796 543705 memory.go:191] Add success.
W0320 16:51:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 16:51:13.409830 543705 cpu.go:282] Add success.
W0320 16:51:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:51:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:51:13.420131 543705 net.go:648] Add success.
I0320 16:51:13.422760 543705 net.go:770] primary dev: ETH0
I0320 16:51:13.422773 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:51:13.422785 543705 net.go:698] Add success.
I0320 16:51:13.518440 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b3f2401f-2fa5-479c-a908-8d0d8e5b9b1b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:51:13.518472 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:51:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:51:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:51:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 16:51:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:51:14.456729 543705 disk_worker.go:494] system disk:vda1
I0320 16:51:14.456758 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:51:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:51:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:51:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:51:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:51:16.472419 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:51:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:23.409772 543705 memory.go:184] no items to output this cycle
I0320 16:51:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 16:51:25.245674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:51:25.248108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:51:25.248113 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a700 0xc00007a740]
E0320 16:51:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 16:51:33.409779 543705 memory.go:184] no items to output this cycle
I0320 16:51:38.555518 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:51:38.555525 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:51:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:43.410667 543705 memory.go:191] Add success.
I0320 16:51:43.409831 543705 cpu.go:282] Add success.
I0320 16:51:43.420375 543705 net.go:648] Add success.
I0320 16:51:43.423011 543705 net.go:770] primary dev: ETH0
I0320 16:51:43.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:51:43.423037 543705 net.go:698] Add success.
I0320 16:51:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:51:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:51:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:51:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:51:53.409809 543705 memory.go:184] no items to output this cycle
I0320 16:51:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 16:52:03.409909 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:03.409930 543705 memory.go:184] no items to output this cycle
I0320 16:52:03.409979 543705 cpu.go:275] no items to output this cycle
E0320 16:52:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:13.409821 543705 memory.go:191] Add success.
I0320 16:52:13.409830 543705 cpu.go:282] Add success.
W0320 16:52:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:52:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:52:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:52:13.420260 543705 net.go:648] Add success.
I0320 16:52:13.422973 543705 net.go:770] primary dev: ETH0
I0320 16:52:13.422991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:52:13.423005 543705 net.go:698] Add success.
W0320 16:52:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:52:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 16:52:14.455197 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:52:14.455905 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:52:14.455914 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:52:14.455920 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:52:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 16:52:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:52:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:52:15.456801 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:52:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:52:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:52:16.457959 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:52:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:52:16.472300 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:52:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:23.409796 543705 memory.go:184] no items to output this cycle
I0320 16:52:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 16:52:25.249671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:52:25.252106 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:52:25.252111 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa980 0xc0001aa9c0]
E0320 16:52:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:33.409770 543705 memory.go:184] no items to output this cycle
I0320 16:52:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 16:52:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:43.409820 543705 memory.go:191] Add success.
I0320 16:52:43.409829 543705 cpu.go:282] Add success.
I0320 16:52:43.420008 543705 net.go:648] Add success.
I0320 16:52:43.422652 543705 net.go:770] primary dev: ETH0
I0320 16:52:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:52:43.422679 543705 net.go:698] Add success.
I0320 16:52:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:52:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:52:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:52:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:52:53.409783 543705 memory.go:184] no items to output this cycle
I0320 16:52:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 16:53:03.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:03.409866 543705 memory.go:184] no items to output this cycle
I0320 16:53:03.409976 543705 cpu.go:275] no items to output this cycle
E0320 16:53:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:13.409791 543705 memory.go:191] Add success.
I0320 16:53:13.409792 543705 cpu.go:282] Add success.
W0320 16:53:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:53:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:53:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:53:13.420173 543705 net.go:648] Add success.
I0320 16:53:13.423328 543705 net.go:770] primary dev: ETH0
I0320 16:53:13.423343 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:53:13.423357 543705 net.go:698] Add success.
I0320 16:53:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:53:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:53:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 16:53:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:53:14.456500 543705 disk_worker.go:494] system disk:vda1
I0320 16:53:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:53:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:53:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:53:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:53:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:53:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:53:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:23.409778 543705 memory.go:184] no items to output this cycle
I0320 16:53:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 16:53:25.253674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:53:25.256101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:53:25.256107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c47c0 0xc0000c4800]
E0320 16:53:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:33.409772 543705 memory.go:184] no items to output this cycle
I0320 16:53:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 16:53:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:43.409820 543705 memory.go:191] Add success.
I0320 16:53:43.409829 543705 cpu.go:282] Add success.
I0320 16:53:43.419968 543705 net.go:648] Add success.
I0320 16:53:43.422685 543705 net.go:770] primary dev: ETH0
I0320 16:53:43.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:53:43.422711 543705 net.go:698] Add success.
I0320 16:53:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:53:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:53:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:53:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:53:53.409805 543705 memory.go:184] no items to output this cycle
I0320 16:53:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 16:54:03.409900 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:03.409913 543705 cpu.go:275] no items to output this cycle
I0320 16:54:03.409919 543705 memory.go:184] no items to output this cycle
E0320 16:54:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:13.409783 543705 memory.go:191] Add success.
I0320 16:54:13.409789 543705 cpu.go:282] Add success.
W0320 16:54:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:54:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:54:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:54:13.420296 543705 net.go:648] Add success.
I0320 16:54:13.422933 543705 net.go:770] primary dev: ETH0
I0320 16:54:13.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:54:13.422958 543705 net.go:698] Add success.
I0320 16:54:13.464183 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2a946051-42ab-423a-8550-c9726bdbe977","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:54:13.464215 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 16:54:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:54:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:54:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 16:54:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:54:14.456683 543705 disk_worker.go:494] system disk:vda1
I0320 16:54:14.456722 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:54:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:54:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:54:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:54:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:54:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:54:23.410356 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:23.410372 543705 memory.go:184] no items to output this cycle
I0320 16:54:23.410402 543705 cpu.go:275] no items to output this cycle
I0320 16:54:25.257669 543705 disk_info.go:125] begin check local disk info of client
I0320 16:54:25.260103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:54:25.260108 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b780 0xc00007b7c0]
E0320 16:54:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:33.409777 543705 cpu.go:275] no items to output this cycle
I0320 16:54:33.409782 543705 memory.go:184] no items to output this cycle
I0320 16:54:38.556544 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:54:38.556551 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:54:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:43.410660 543705 memory.go:191] Add success.
I0320 16:54:43.409816 543705 cpu.go:282] Add success.
I0320 16:54:43.420359 543705 net.go:648] Add success.
I0320 16:54:43.423174 543705 net.go:770] primary dev: ETH0
I0320 16:54:43.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:54:43.423199 543705 net.go:698] Add success.
I0320 16:54:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:54:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:54:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:54:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:54:53.409772 543705 memory.go:184] no items to output this cycle
I0320 16:54:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 16:55:03.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:03.409891 543705 memory.go:184] no items to output this cycle
I0320 16:55:03.410057 543705 cpu.go:275] no items to output this cycle
E0320 16:55:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:13.409823 543705 memory.go:191] Add success.
I0320 16:55:13.409825 543705 cpu.go:282] Add success.
W0320 16:55:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:55:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:55:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:55:13.420238 543705 net.go:648] Add success.
I0320 16:55:13.423038 543705 net.go:770] primary dev: ETH0
I0320 16:55:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:55:13.423063 543705 net.go:698] Add success.
I0320 16:55:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:55:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:55:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 16:55:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:55:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 16:55:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:55:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:55:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:55:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:55:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:55:16.472445 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:55:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:23.409767 543705 memory.go:184] no items to output this cycle
I0320 16:55:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 16:55:25.261672 543705 disk_info.go:125] begin check local disk info of client
I0320 16:55:25.264112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:55:25.264118 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471d00 0xc000471d40]
E0320 16:55:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:33.409774 543705 memory.go:184] no items to output this cycle
I0320 16:55:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 16:55:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:43.409806 543705 cpu.go:282] Add success.
I0320 16:55:43.409808 543705 memory.go:191] Add success.
I0320 16:55:43.420048 543705 net.go:648] Add success.
I0320 16:55:43.422685 543705 net.go:770] primary dev: ETH0
I0320 16:55:43.422701 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:55:43.422717 543705 net.go:698] Add success.
I0320 16:55:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:55:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:55:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:55:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:55:53.409802 543705 memory.go:184] no items to output this cycle
I0320 16:55:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 16:56:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:03.409779 543705 memory.go:184] no items to output this cycle
I0320 16:56:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 16:56:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:13.409797 543705 memory.go:191] Add success.
I0320 16:56:13.409799 543705 cpu.go:282] Add success.
W0320 16:56:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:56:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:56:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:56:13.420042 543705 net.go:648] Add success.
I0320 16:56:13.422859 543705 net.go:770] primary dev: ETH0
I0320 16:56:13.422872 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:56:13.422884 543705 net.go:698] Add success.
I0320 16:56:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:56:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:56:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 16:56:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:56:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 16:56:14.456542 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:56:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:56:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:56:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:56:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:56:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:56:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:23.409776 543705 memory.go:184] no items to output this cycle
I0320 16:56:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 16:56:25.265671 543705 disk_info.go:125] begin check local disk info of client
I0320 16:56:25.268076 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:56:25.268082 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aae40 0xc0001aae80]
E0320 16:56:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:33.409782 543705 memory.go:184] no items to output this cycle
I0320 16:56:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 16:56:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:43.409821 543705 memory.go:191] Add success.
I0320 16:56:43.409830 543705 cpu.go:282] Add success.
I0320 16:56:43.420009 543705 net.go:648] Add success.
I0320 16:56:43.422958 543705 net.go:770] primary dev: ETH0
I0320 16:56:43.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:56:43.422987 543705 net.go:698] Add success.
I0320 16:56:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:56:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:56:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:56:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:56:53.409773 543705 memory.go:184] no items to output this cycle
I0320 16:56:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 16:57:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:03.409795 543705 memory.go:184] no items to output this cycle
I0320 16:57:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 16:57:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:13.409787 543705 memory.go:191] Add success.
I0320 16:57:13.409808 543705 cpu.go:282] Add success.
W0320 16:57:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:57:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:57:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:57:13.419734 543705 net.go:648] Add success.
I0320 16:57:13.422773 543705 net.go:770] primary dev: ETH0
I0320 16:57:13.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:57:13.422798 543705 net.go:698] Add success.
I0320 16:57:13.429360 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 16:57:13.453538 543705 event_worker.go:152] Polling the log file for events...
I0320 16:57:13.464004 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"98242cf6-28dd-4d92-9616-ff49e9a2cbe7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 16:57:13.464034 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 16:57:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:57:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 16:57:14.455164 543705 disk_worker.go:728] disk inode is not compliant
E0320 16:57:14.456941 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 16:57:14.456950 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 16:57:14.456956 543705 custom_config.go:64] query custom config with name: gpu
I0320 16:57:14.456989 543705 disk_worker.go:494] system disk:vda1
I0320 16:57:14.457017 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 16:57:15.456864 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 16:57:15.456873 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:57:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 16:57:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 16:57:16.457995 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:57:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:57:16.472337 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:57:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:23.409776 543705 memory.go:184] no items to output this cycle
I0320 16:57:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 16:57:25.269670 543705 disk_info.go:125] begin check local disk info of client
I0320 16:57:25.272189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:57:25.272196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004eb300 0xc0004eb340]
E0320 16:57:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:33.409775 543705 memory.go:184] no items to output this cycle
I0320 16:57:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 16:57:38.557526 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 16:57:38.557531 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 16:57:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:43.410792 543705 memory.go:191] Add success.
I0320 16:57:43.409822 543705 cpu.go:282] Add success.
I0320 16:57:43.420503 543705 net.go:648] Add success.
I0320 16:57:43.423737 543705 net.go:770] primary dev: ETH0
I0320 16:57:43.423752 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:57:43.423765 543705 net.go:698] Add success.
I0320 16:57:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:57:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:57:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:57:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:57:53.409802 543705 memory.go:184] no items to output this cycle
I0320 16:57:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 16:58:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:03.409788 543705 memory.go:184] no items to output this cycle
I0320 16:58:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 16:58:13.409865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:13.409904 543705 memory.go:191] Add success.
W0320 16:58:13.410023 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:58:13.410042 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:58:13.410047 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:58:13.410063 543705 cpu.go:282] Add success.
I0320 16:58:13.419710 543705 net.go:648] Add success.
I0320 16:58:13.422372 543705 net.go:770] primary dev: ETH0
I0320 16:58:13.422385 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:58:13.422396 543705 net.go:698] Add success.
I0320 16:58:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:58:14.455103 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:58:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 16:58:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:58:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 16:58:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:58:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:58:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:58:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:58:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:58:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:58:23.410255 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:23.410272 543705 memory.go:184] no items to output this cycle
I0320 16:58:23.410284 543705 cpu.go:275] no items to output this cycle
I0320 16:58:25.273674 543705 disk_info.go:125] begin check local disk info of client
I0320 16:58:25.276130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:58:25.276136 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be80 0xc00007bec0]
E0320 16:58:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:33.409774 543705 memory.go:184] no items to output this cycle
I0320 16:58:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 16:58:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:43.409794 543705 memory.go:191] Add success.
I0320 16:58:43.409800 543705 cpu.go:282] Add success.
I0320 16:58:43.419865 543705 net.go:648] Add success.
I0320 16:58:43.422997 543705 net.go:770] primary dev: ETH0
I0320 16:58:43.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:58:43.423031 543705 net.go:698] Add success.
I0320 16:58:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:58:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:58:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:58:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:58:53.409763 543705 memory.go:184] no items to output this cycle
I0320 16:58:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 16:59:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:03.409773 543705 memory.go:184] no items to output this cycle
I0320 16:59:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 16:59:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:13.409806 543705 memory.go:191] Add success.
I0320 16:59:13.409820 543705 cpu.go:282] Add success.
W0320 16:59:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 16:59:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 16:59:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 16:59:13.419741 543705 net.go:648] Add success.
I0320 16:59:13.423012 543705 net.go:770] primary dev: ETH0
I0320 16:59:13.423025 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:59:13.423036 543705 net.go:698] Add success.
I0320 16:59:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 16:59:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 16:59:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 16:59:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 16:59:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 16:59:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 16:59:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 16:59:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:59:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:59:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 16:59:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 16:59:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:23.409779 543705 memory.go:184] no items to output this cycle
I0320 16:59:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 16:59:25.277673 543705 disk_info.go:125] begin check local disk info of client
I0320 16:59:25.280097 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 16:59:25.280103 543705 disk_info.go:196] parse disk info done, disk is : [0xc000586100 0xc000586140]
E0320 16:59:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:33.409777 543705 cpu.go:275] no items to output this cycle
I0320 16:59:33.409784 543705 memory.go:184] no items to output this cycle
E0320 16:59:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:43.409822 543705 memory.go:191] Add success.
I0320 16:59:43.409826 543705 cpu.go:282] Add success.
I0320 16:59:43.419985 543705 net.go:648] Add success.
I0320 16:59:43.422928 543705 net.go:770] primary dev: ETH0
I0320 16:59:43.422943 543705 net.go:802] Send network stats successfully!,count is 6
I0320 16:59:43.422966 543705 net.go:698] Add success.
I0320 16:59:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 16:59:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 16:59:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 16:59:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 16:59:53.409800 543705 memory.go:184] no items to output this cycle
I0320 16:59:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 17:00:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:03.409790 543705 memory.go:184] no items to output this cycle
I0320 17:00:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 17:00:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:13.409784 543705 memory.go:191] Add success.
W0320 17:00:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:00:13.409810 543705 cpu.go:282] Add success.
W0320 17:00:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:00:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:00:13.420306 543705 net.go:648] Add success.
I0320 17:00:13.423293 543705 net.go:770] primary dev: ETH0
I0320 17:00:13.423307 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:00:13.423321 543705 net.go:698] Add success.
I0320 17:00:13.684815 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cc380024-9354-4c09-94b9-f2268cf2a833","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:00:13.684847 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:00:14.454719 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:00:14.454917 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:00:14.454927 543705 disk_worker.go:708] disk space is not compliant
W0320 17:00:14.454930 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:00:14.456285 543705 disk_worker.go:494] system disk:vda1
I0320 17:00:14.456328 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:00:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:00:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:00:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:00:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:00:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:00:23.410375 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:23.410394 543705 memory.go:184] no items to output this cycle
I0320 17:00:23.410404 543705 cpu.go:275] no items to output this cycle
I0320 17:00:25.281671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:00:25.284123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:00:25.284128 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390a00 0xc000390a40]
E0320 17:00:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:33.409779 543705 memory.go:184] no items to output this cycle
I0320 17:00:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 17:00:38.558556 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:00:38.558563 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:00:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:43.410685 543705 memory.go:191] Add success.
I0320 17:00:43.409835 543705 cpu.go:282] Add success.
I0320 17:00:43.420404 543705 net.go:648] Add success.
I0320 17:00:43.423376 543705 net.go:770] primary dev: ETH0
I0320 17:00:43.423389 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:00:43.423401 543705 net.go:698] Add success.
I0320 17:00:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:00:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:00:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:00:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:00:53.409795 543705 memory.go:184] no items to output this cycle
I0320 17:00:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 17:01:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:03.409782 543705 memory.go:184] no items to output this cycle
I0320 17:01:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:01:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:13.409783 543705 memory.go:191] Add success.
I0320 17:01:13.409801 543705 cpu.go:282] Add success.
W0320 17:01:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:01:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:01:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:01:13.419719 543705 net.go:648] Add success.
I0320 17:01:13.422509 543705 net.go:770] primary dev: ETH0
I0320 17:01:13.422524 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:01:13.422537 543705 net.go:698] Add success.
I0320 17:01:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:01:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:01:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 17:01:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:01:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 17:01:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:01:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:01:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:01:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:01:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:01:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 17:01:23.409778 543705 memory.go:184] no items to output this cycle
I0320 17:01:25.285672 543705 disk_info.go:125] begin check local disk info of client
I0320 17:01:25.288086 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:01:25.288092 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d35c0 0xc0003d3600]
E0320 17:01:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:33.409761 543705 memory.go:184] no items to output this cycle
I0320 17:01:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 17:01:43.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:43.409830 543705 memory.go:191] Add success.
I0320 17:01:43.409834 543705 cpu.go:282] Add success.
I0320 17:01:43.419959 543705 net.go:648] Add success.
I0320 17:01:43.423238 543705 net.go:770] primary dev: ETH0
I0320 17:01:43.423252 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:01:43.423266 543705 net.go:698] Add success.
I0320 17:01:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:01:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:01:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:01:53.409800 543705 memory.go:184] no items to output this cycle
I0320 17:01:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 17:02:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:03.409777 543705 memory.go:184] no items to output this cycle
I0320 17:02:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 17:02:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:13.409806 543705 memory.go:191] Add success.
I0320 17:02:13.409814 543705 cpu.go:282] Add success.
W0320 17:02:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:02:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:02:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:02:13.419710 543705 net.go:648] Add success.
I0320 17:02:13.422685 543705 net.go:770] primary dev: ETH0
I0320 17:02:13.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:02:13.422713 543705 net.go:698] Add success.
W0320 17:02:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:02:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 17:02:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:02:14.456749 543705 disk_worker.go:494] system disk:vda1
I0320 17:02:14.456788 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:02:14.457184 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:02:14.457192 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:02:14.457196 543705 custom_config.go:64] query custom config with name: gpu
E0320 17:02:15.456803 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:02:15.456812 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:02:16.457941 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:02:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:02:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:02:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:02:16.472338 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:02:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:23.409792 543705 memory.go:184] no items to output this cycle
I0320 17:02:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 17:02:25.289683 543705 disk_info.go:125] begin check local disk info of client
I0320 17:02:25.292245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:02:25.292253 543705 disk_info.go:196] parse disk info done, disk is : [0xc000587300 0xc000587340]
E0320 17:02:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:33.409775 543705 memory.go:184] no items to output this cycle
I0320 17:02:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 17:02:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:43.409792 543705 memory.go:191] Add success.
I0320 17:02:43.409815 543705 cpu.go:282] Add success.
I0320 17:02:43.419870 543705 net.go:648] Add success.
I0320 17:02:43.422711 543705 net.go:770] primary dev: ETH0
I0320 17:02:43.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:02:43.422743 543705 net.go:698] Add success.
I0320 17:02:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:02:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:02:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:02:53.410353 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:02:53.410366 543705 cpu.go:275] no items to output this cycle
I0320 17:02:53.410368 543705 memory.go:184] no items to output this cycle
E0320 17:03:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:03.409801 543705 memory.go:184] no items to output this cycle
I0320 17:03:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 17:03:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:13.409815 543705 memory.go:191] Add success.
I0320 17:03:13.409820 543705 cpu.go:282] Add success.
W0320 17:03:13.409978 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:03:13.410001 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:03:13.410006 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:03:13.419703 543705 net.go:648] Add success.
I0320 17:03:13.422848 543705 net.go:770] primary dev: ETH0
I0320 17:03:13.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:03:13.422878 543705 net.go:698] Add success.
I0320 17:03:13.464054 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5320d097-6613-4d08-99bb-dfa5b80eb94c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:03:13.464085 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:03:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:03:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:03:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 17:03:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:03:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 17:03:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:03:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:03:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:03:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:03:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:03:16.472433 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:03:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:23.409762 543705 memory.go:184] no items to output this cycle
I0320 17:03:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 17:03:25.293670 543705 disk_info.go:125] begin check local disk info of client
I0320 17:03:25.296108 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:03:25.296114 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cb340 0xc0004cb380]
E0320 17:03:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:33.409797 543705 memory.go:184] no items to output this cycle
I0320 17:03:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 17:03:38.559545 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:03:38.559552 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:03:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:43.410573 543705 memory.go:191] Add success.
I0320 17:03:43.409818 543705 cpu.go:282] Add success.
I0320 17:03:43.420326 543705 net.go:648] Add success.
I0320 17:03:43.423315 543705 net.go:770] primary dev: ETH0
I0320 17:03:43.423328 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:03:43.423343 543705 net.go:698] Add success.
I0320 17:03:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:03:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:03:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:03:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:03:53.409795 543705 memory.go:184] no items to output this cycle
I0320 17:03:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 17:04:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:03.409786 543705 memory.go:184] no items to output this cycle
I0320 17:04:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:04:13.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:13.409897 543705 cpu.go:282] Add success.
I0320 17:04:13.409906 543705 memory.go:191] Add success.
W0320 17:04:13.409941 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:04:13.409960 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:04:13.409969 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:04:13.419725 543705 net.go:648] Add success.
I0320 17:04:13.422616 543705 net.go:770] primary dev: ETH0
I0320 17:04:13.422629 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:04:13.422640 543705 net.go:698] Add success.
I0320 17:04:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:04:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:04:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 17:04:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:04:14.456536 543705 disk_worker.go:494] system disk:vda1
I0320 17:04:14.456579 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:04:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:04:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:04:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:04:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:04:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:04:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:23.409792 543705 memory.go:184] no items to output this cycle
I0320 17:04:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 17:04:25.297674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:04:25.300175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:04:25.300180 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ab80 0xc00007abc0]
E0320 17:04:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:33.409776 543705 memory.go:184] no items to output this cycle
I0320 17:04:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 17:04:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:43.409800 543705 memory.go:191] Add success.
I0320 17:04:43.409801 543705 cpu.go:282] Add success.
I0320 17:04:43.420015 543705 net.go:648] Add success.
I0320 17:04:43.423086 543705 net.go:770] primary dev: ETH0
I0320 17:04:43.423100 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:04:43.423113 543705 net.go:698] Add success.
I0320 17:04:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:04:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:04:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:04:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:04:53.409779 543705 memory.go:184] no items to output this cycle
I0320 17:04:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 17:05:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:03.409784 543705 memory.go:184] no items to output this cycle
I0320 17:05:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:05:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:13.409787 543705 memory.go:191] Add success.
I0320 17:05:13.409786 543705 cpu.go:282] Add success.
W0320 17:05:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:05:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:05:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:05:13.420254 543705 net.go:648] Add success.
I0320 17:05:13.423156 543705 net.go:770] primary dev: ETH0
I0320 17:05:13.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:05:13.423181 543705 net.go:698] Add success.
I0320 17:05:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:05:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:05:14.455142 543705 disk_worker.go:708] disk space is not compliant
W0320 17:05:14.455144 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:05:14.456476 543705 disk_worker.go:494] system disk:vda1
I0320 17:05:14.456518 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:05:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:05:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:05:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:05:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:05:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:05:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:23.409779 543705 memory.go:184] no items to output this cycle
I0320 17:05:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 17:05:25.302968 543705 disk_info.go:125] begin check local disk info of client
I0320 17:05:25.305423 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:05:25.305441 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340840 0xc000340880]
E0320 17:05:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:33.409775 543705 memory.go:184] no items to output this cycle
I0320 17:05:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 17:05:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:43.409806 543705 memory.go:191] Add success.
I0320 17:05:43.409807 543705 cpu.go:282] Add success.
I0320 17:05:43.420011 543705 net.go:648] Add success.
I0320 17:05:43.422835 543705 net.go:770] primary dev: ETH0
I0320 17:05:43.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:05:43.422861 543705 net.go:698] Add success.
I0320 17:05:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:05:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:05:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:05:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:05:53.409790 543705 memory.go:184] no items to output this cycle
I0320 17:05:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 17:06:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:03.409778 543705 memory.go:184] no items to output this cycle
I0320 17:06:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 17:06:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:13.409783 543705 memory.go:191] Add success.
I0320 17:06:13.409800 543705 cpu.go:282] Add success.
W0320 17:06:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:06:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:06:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:06:13.420333 543705 net.go:648] Add success.
I0320 17:06:13.423625 543705 net.go:770] primary dev: ETH0
I0320 17:06:13.423638 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:06:13.423649 543705 net.go:698] Add success.
I0320 17:06:13.470249 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f2661989-bc19-4e5c-ba4b-bfaa76caa3b1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:06:13.470280 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:06:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:06:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:06:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 17:06:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:06:14.456689 543705 disk_worker.go:494] system disk:vda1
I0320 17:06:14.456718 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:06:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:06:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:06:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:06:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:06:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:06:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:23.409775 543705 memory.go:184] no items to output this cycle
I0320 17:06:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 17:06:25.305669 543705 disk_info.go:125] begin check local disk info of client
I0320 17:06:25.308139 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:06:25.308145 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8940 0xc0004a8980]
E0320 17:06:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:33.409784 543705 memory.go:184] no items to output this cycle
I0320 17:06:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 17:06:38.560564 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:06:38.560570 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:06:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:43.410644 543705 memory.go:191] Add success.
I0320 17:06:43.409832 543705 cpu.go:282] Add success.
I0320 17:06:43.420370 543705 net.go:648] Add success.
I0320 17:06:43.422875 543705 net.go:770] primary dev: ETH0
I0320 17:06:43.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:06:43.422905 543705 net.go:698] Add success.
I0320 17:06:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:06:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:06:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:06:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:06:53.409793 543705 memory.go:184] no items to output this cycle
I0320 17:06:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 17:07:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:03.409788 543705 memory.go:184] no items to output this cycle
I0320 17:07:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:07:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:13.409790 543705 memory.go:191] Add success.
I0320 17:07:13.409791 543705 cpu.go:282] Add success.
W0320 17:07:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:07:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:07:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:07:13.420070 543705 net.go:648] Add success.
I0320 17:07:13.423303 543705 net.go:770] primary dev: ETH0
I0320 17:07:13.423319 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:07:13.423333 543705 net.go:698] Add success.
I0320 17:07:13.452838 543705 event_worker.go:152] Polling the log file for events...
W0320 17:07:14.456362 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:07:14.456375 543705 disk_worker.go:708] disk space is not compliant
W0320 17:07:14.456379 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:07:14.457107 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:07:14.457116 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:07:14.457122 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:07:14.457996 543705 disk_worker.go:494] system disk:vda1
I0320 17:07:14.458022 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:07:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:07:15.456801 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:07:16.457938 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:07:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:07:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:07:16.458011 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:07:16.472361 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:07:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:23.409769 543705 memory.go:184] no items to output this cycle
I0320 17:07:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 17:07:25.309672 543705 disk_info.go:125] begin check local disk info of client
I0320 17:07:25.312101 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:07:25.312107 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbec0 0xc0001fbf00]
E0320 17:07:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:33.409782 543705 memory.go:184] no items to output this cycle
I0320 17:07:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:07:43.410023 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:43.410052 543705 memory.go:191] Add success.
I0320 17:07:43.410083 543705 cpu.go:282] Add success.
I0320 17:07:43.420337 543705 net.go:648] Add success.
I0320 17:07:43.423126 543705 net.go:770] primary dev: ETH0
I0320 17:07:43.423139 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:07:43.423151 543705 net.go:698] Add success.
I0320 17:07:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:07:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:07:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:07:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:07:53.409766 543705 memory.go:184] no items to output this cycle
I0320 17:07:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 17:08:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:03.409786 543705 memory.go:184] no items to output this cycle
I0320 17:08:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:08:13.409807 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:13.409815 543705 cpu.go:282] Add success.
I0320 17:08:13.409830 543705 memory.go:191] Add success.
W0320 17:08:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:08:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:08:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:08:13.420058 543705 net.go:648] Add success.
I0320 17:08:13.422833 543705 net.go:770] primary dev: ETH0
I0320 17:08:13.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:08:13.422877 543705 net.go:698] Add success.
I0320 17:08:14.454988 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:08:14.455293 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:08:14.455391 543705 disk_worker.go:708] disk space is not compliant
W0320 17:08:14.455412 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:08:14.457016 543705 disk_worker.go:494] system disk:vda1
I0320 17:08:14.457046 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:08:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:08:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:08:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:08:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:08:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:08:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:23.409787 543705 memory.go:184] no items to output this cycle
I0320 17:08:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 17:08:25.313683 543705 disk_info.go:125] begin check local disk info of client
I0320 17:08:25.316124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:08:25.316130 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b7c0 0xc00007b800]
E0320 17:08:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 17:08:33.409805 543705 memory.go:184] no items to output this cycle
E0320 17:08:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:43.409836 543705 memory.go:191] Add success.
I0320 17:08:43.409843 543705 cpu.go:282] Add success.
I0320 17:08:43.420008 543705 net.go:648] Add success.
I0320 17:08:43.422736 543705 net.go:770] primary dev: ETH0
I0320 17:08:43.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:08:43.422763 543705 net.go:698] Add success.
I0320 17:08:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:08:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:08:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:08:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:08:53.409789 543705 memory.go:184] no items to output this cycle
I0320 17:08:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:09:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:03.409791 543705 memory.go:184] no items to output this cycle
I0320 17:09:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:09:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:13.409802 543705 memory.go:191] Add success.
I0320 17:09:13.409804 543705 cpu.go:282] Add success.
W0320 17:09:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:09:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:09:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:09:13.420043 543705 net.go:648] Add success.
I0320 17:09:13.422912 543705 net.go:770] primary dev: ETH0
I0320 17:09:13.422925 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:09:13.422937 543705 net.go:698] Add success.
I0320 17:09:13.472937 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0982ed41-24d2-4bb6-91a6-09e485776676","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:09:13.472972 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:09:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:09:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:09:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 17:09:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:09:14.456806 543705 disk_worker.go:494] system disk:vda1
I0320 17:09:14.456847 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:09:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:09:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:09:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:09:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:09:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:09:23.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:23.409816 543705 memory.go:184] no items to output this cycle
I0320 17:09:23.409827 543705 cpu.go:275] no items to output this cycle
I0320 17:09:25.317675 543705 disk_info.go:125] begin check local disk info of client
I0320 17:09:25.320135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:09:25.320141 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be200 0xc0002be240]
E0320 17:09:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 17:09:33.409796 543705 memory.go:184] no items to output this cycle
I0320 17:09:38.561566 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:09:38.561573 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:09:43.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:43.410641 543705 memory.go:191] Add success.
I0320 17:09:43.409848 543705 cpu.go:282] Add success.
I0320 17:09:43.420338 543705 net.go:648] Add success.
I0320 17:09:43.422869 543705 net.go:770] primary dev: ETH0
I0320 17:09:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:09:43.422894 543705 net.go:698] Add success.
I0320 17:09:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:09:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:09:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:09:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:09:53.409776 543705 memory.go:184] no items to output this cycle
I0320 17:09:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 17:10:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:03.409780 543705 memory.go:184] no items to output this cycle
I0320 17:10:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 17:10:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:13.409827 543705 memory.go:191] Add success.
I0320 17:10:13.409835 543705 cpu.go:282] Add success.
W0320 17:10:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:10:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:10:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:10:13.420151 543705 net.go:648] Add success.
I0320 17:10:13.422680 543705 net.go:770] primary dev: ETH0
I0320 17:10:13.422693 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:10:13.422705 543705 net.go:698] Add success.
I0320 17:10:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:10:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:10:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 17:10:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:10:14.456474 543705 disk_worker.go:494] system disk:vda1
I0320 17:10:14.456503 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:10:15.454989 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:10:16.458036 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:10:16.458097 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:10:16.458119 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:10:16.472480 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:10:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:23.409786 543705 memory.go:184] no items to output this cycle
I0320 17:10:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 17:10:25.321674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:10:25.324136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:10:25.324142 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002be780 0xc0002be7c0]
E0320 17:10:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:33.409784 543705 memory.go:184] no items to output this cycle
I0320 17:10:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 17:10:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:43.409809 543705 memory.go:191] Add success.
I0320 17:10:43.409827 543705 cpu.go:282] Add success.
I0320 17:10:43.419926 543705 net.go:648] Add success.
I0320 17:10:43.422652 543705 net.go:770] primary dev: ETH0
I0320 17:10:43.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:10:43.422679 543705 net.go:698] Add success.
I0320 17:10:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:10:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:10:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:10:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:10:53.409766 543705 memory.go:184] no items to output this cycle
I0320 17:10:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 17:11:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:03.409777 543705 memory.go:184] no items to output this cycle
I0320 17:11:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 17:11:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:13.409809 543705 memory.go:191] Add success.
I0320 17:11:13.409819 543705 cpu.go:282] Add success.
W0320 17:11:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:11:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:11:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:11:13.420114 543705 net.go:648] Add success.
I0320 17:11:13.422705 543705 net.go:770] primary dev: ETH0
I0320 17:11:13.422718 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:11:13.422729 543705 net.go:698] Add success.
I0320 17:11:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:11:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:11:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 17:11:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:11:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 17:11:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:11:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:11:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:11:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:11:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:11:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:11:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:23.409771 543705 memory.go:184] no items to output this cycle
I0320 17:11:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 17:11:25.325674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:11:25.328112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:11:25.328118 543705 disk_info.go:196] parse disk info done, disk is : [0xc000472740 0xc000472780]
E0320 17:11:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:33.409785 543705 memory.go:184] no items to output this cycle
I0320 17:11:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 17:11:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:43.409798 543705 memory.go:191] Add success.
I0320 17:11:43.409816 543705 cpu.go:282] Add success.
I0320 17:11:43.420008 543705 net.go:648] Add success.
I0320 17:11:43.423247 543705 net.go:770] primary dev: ETH0
I0320 17:11:43.423260 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:11:43.423274 543705 net.go:698] Add success.
I0320 17:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:11:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:11:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:11:53.409792 543705 memory.go:184] no items to output this cycle
I0320 17:11:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 17:12:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:03.409777 543705 memory.go:184] no items to output this cycle
I0320 17:12:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 17:12:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:13.409807 543705 memory.go:191] Add success.
I0320 17:12:13.409816 543705 cpu.go:282] Add success.
W0320 17:12:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:12:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:12:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:12:13.420269 543705 net.go:648] Add success.
I0320 17:12:13.423094 543705 net.go:770] primary dev: ETH0
I0320 17:12:13.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:12:13.423123 543705 net.go:698] Add success.
I0320 17:12:13.581250 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"78e03e9e-ea4b-421c-b300-6d87529a5637","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:12:13.581284 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 17:12:14.455226 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:12:14.455240 543705 disk_worker.go:708] disk space is not compliant
W0320 17:12:14.455244 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:12:14.456079 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:12:14.456088 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:12:14.456094 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:12:14.457047 543705 disk_worker.go:494] system disk:vda1
I0320 17:12:14.457087 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:12:15.456803 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:12:15.456811 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:12:16.457915 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:12:16.457915 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:12:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:12:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:12:16.472335 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:12:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:23.409793 543705 memory.go:184] no items to output this cycle
I0320 17:12:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 17:12:25.329680 543705 disk_info.go:125] begin check local disk info of client
I0320 17:12:25.332111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:12:25.332117 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a000 0xc00048a040]
E0320 17:12:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 17:12:33.409798 543705 memory.go:184] no items to output this cycle
I0320 17:12:38.562570 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:12:38.562577 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:12:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:43.410818 543705 memory.go:191] Add success.
I0320 17:12:43.409834 543705 cpu.go:282] Add success.
I0320 17:12:43.420535 543705 net.go:648] Add success.
I0320 17:12:43.423613 543705 net.go:770] primary dev: ETH0
I0320 17:12:43.423626 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:12:43.423638 543705 net.go:698] Add success.
I0320 17:12:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:12:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:12:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:12:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:12:53.409774 543705 memory.go:184] no items to output this cycle
I0320 17:12:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 17:13:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:03.409778 543705 memory.go:184] no items to output this cycle
I0320 17:13:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 17:13:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:13.409785 543705 memory.go:191] Add success.
I0320 17:13:13.409804 543705 cpu.go:282] Add success.
W0320 17:13:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:13:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:13:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:13:13.420062 543705 net.go:648] Add success.
I0320 17:13:13.422488 543705 net.go:770] primary dev: ETH0
I0320 17:13:13.422506 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:13:13.422520 543705 net.go:698] Add success.
I0320 17:13:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:13:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:13:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 17:13:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:13:14.456517 543705 disk_worker.go:494] system disk:vda1
I0320 17:13:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:13:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:13:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:13:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:13:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:13:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:13:23.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:23.409881 543705 cpu.go:275] no items to output this cycle
I0320 17:13:23.409930 543705 memory.go:184] no items to output this cycle
I0320 17:13:25.333671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:13:25.336191 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:13:25.336198 543705 disk_info.go:196] parse disk info done, disk is : [0xc000348000 0xc000348040]
E0320 17:13:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:33.409781 543705 memory.go:184] no items to output this cycle
I0320 17:13:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 17:13:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:43.409799 543705 memory.go:191] Add success.
I0320 17:13:43.409821 543705 cpu.go:282] Add success.
I0320 17:13:43.419981 543705 net.go:648] Add success.
I0320 17:13:43.422551 543705 net.go:770] primary dev: ETH0
I0320 17:13:43.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:13:43.422577 543705 net.go:698] Add success.
I0320 17:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:13:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:13:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:13:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:13:53.409774 543705 memory.go:184] no items to output this cycle
I0320 17:13:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:14:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:03.409795 543705 cpu.go:275] no items to output this cycle
I0320 17:14:03.409798 543705 memory.go:184] no items to output this cycle
E0320 17:14:13.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:13.409785 543705 memory.go:191] Add success.
W0320 17:14:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:14:13.409818 543705 cpu.go:282] Add success.
W0320 17:14:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:14:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:14:13.420093 543705 net.go:648] Add success.
I0320 17:14:13.423293 543705 net.go:770] primary dev: ETH0
I0320 17:14:13.423309 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:14:13.423321 543705 net.go:698] Add success.
I0320 17:14:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:14:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:14:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 17:14:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:14:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 17:14:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:14:15.454995 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:14:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:14:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:14:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:14:16.472424 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:14:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 17:14:23.409797 543705 memory.go:184] no items to output this cycle
I0320 17:14:25.337688 543705 disk_info.go:125] begin check local disk info of client
I0320 17:14:25.340021 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:14:25.340028 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007adc0 0xc00007ae00]
E0320 17:14:33.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:33.409819 543705 memory.go:184] no items to output this cycle
I0320 17:14:33.409831 543705 cpu.go:275] no items to output this cycle
E0320 17:14:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:43.409815 543705 memory.go:191] Add success.
I0320 17:14:43.409817 543705 cpu.go:282] Add success.
I0320 17:14:43.420011 543705 net.go:648] Add success.
I0320 17:14:43.422819 543705 net.go:770] primary dev: ETH0
I0320 17:14:43.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:14:43.422849 543705 net.go:698] Add success.
I0320 17:14:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:14:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:14:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:14:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:14:53.409771 543705 memory.go:184] no items to output this cycle
I0320 17:14:53.409776 543705 cpu.go:275] no items to output this cycle
E0320 17:15:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:03.409783 543705 memory.go:184] no items to output this cycle
I0320 17:15:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:15:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:13.409814 543705 memory.go:191] Add success.
I0320 17:15:13.409819 543705 cpu.go:282] Add success.
W0320 17:15:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:15:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:15:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:15:13.420098 543705 net.go:648] Add success.
I0320 17:15:13.423068 543705 net.go:770] primary dev: ETH0
I0320 17:15:13.423082 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:15:13.423093 543705 net.go:698] Add success.
I0320 17:15:13.514102 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"61d1750f-c590-4aa4-9456-2a89cb21e1bc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:15:13.514136 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:15:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:15:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:15:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0320 17:15:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:15:14.456499 543705 disk_worker.go:494] system disk:vda1
I0320 17:15:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:15:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:15:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:15:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:15:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:15:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:15:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:23.409772 543705 memory.go:184] no items to output this cycle
I0320 17:15:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 17:15:25.341678 543705 disk_info.go:125] begin check local disk info of client
I0320 17:15:25.344098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:15:25.344105 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0320 17:15:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:33.409783 543705 memory.go:184] no items to output this cycle
I0320 17:15:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 17:15:38.563568 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:15:38.563574 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:15:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:43.410644 543705 memory.go:191] Add success.
I0320 17:15:43.409803 543705 cpu.go:282] Add success.
I0320 17:15:43.420308 543705 net.go:648] Add success.
I0320 17:15:43.422971 543705 net.go:770] primary dev: ETH0
I0320 17:15:43.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:15:43.423004 543705 net.go:698] Add success.
I0320 17:15:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:15:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:15:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:15:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:15:53.409771 543705 memory.go:184] no items to output this cycle
I0320 17:15:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:16:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:03.409781 543705 memory.go:184] no items to output this cycle
I0320 17:16:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:16:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:13.409809 543705 memory.go:191] Add success.
I0320 17:16:13.409816 543705 cpu.go:282] Add success.
W0320 17:16:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:16:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:16:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:16:13.420328 543705 net.go:648] Add success.
I0320 17:16:13.423156 543705 net.go:770] primary dev: ETH0
I0320 17:16:13.423169 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:16:13.423182 543705 net.go:698] Add success.
I0320 17:16:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:16:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:16:14.455264 543705 disk_worker.go:708] disk space is not compliant
W0320 17:16:14.455268 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:16:14.457509 543705 disk_worker.go:494] system disk:vda1
I0320 17:16:14.457537 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:16:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:16:16.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:16:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:16:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:16:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:16:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 17:16:23.409777 543705 memory.go:184] no items to output this cycle
I0320 17:16:25.345675 543705 disk_info.go:125] begin check local disk info of client
I0320 17:16:25.348200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:16:25.348207 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270d00 0xc000270d40]
E0320 17:16:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:33.409781 543705 memory.go:184] no items to output this cycle
I0320 17:16:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:16:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:43.409795 543705 memory.go:191] Add success.
I0320 17:16:43.409830 543705 cpu.go:282] Add success.
I0320 17:16:43.420010 543705 net.go:648] Add success.
I0320 17:16:43.423233 543705 net.go:770] primary dev: ETH0
I0320 17:16:43.423247 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:16:43.423263 543705 net.go:698] Add success.
I0320 17:16:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:16:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:16:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:16:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:16:53.409766 543705 memory.go:184] no items to output this cycle
I0320 17:16:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 17:17:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:03.409775 543705 memory.go:184] no items to output this cycle
I0320 17:17:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 17:17:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:13.409806 543705 memory.go:191] Add success.
I0320 17:17:13.409814 543705 cpu.go:282] Add success.
W0320 17:17:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:17:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:17:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:17:13.420442 543705 net.go:648] Add success.
I0320 17:17:13.423553 543705 net.go:770] primary dev: ETH0
I0320 17:17:13.423568 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:17:13.423581 543705 net.go:698] Add success.
I0320 17:17:13.452770 543705 event_worker.go:152] Polling the log file for events...
W0320 17:17:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:17:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 17:17:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:17:14.456927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:17:14.456936 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:17:14.456942 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:17:14.456989 543705 disk_worker.go:494] system disk:vda1
I0320 17:17:14.457042 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:17:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:17:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:17:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:17:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:17:16.458008 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:17:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:17:16.472354 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:17:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:23.409789 543705 memory.go:184] no items to output this cycle
I0320 17:17:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 17:17:25.349674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:17:25.352148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:17:25.352155 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a940 0xc00007aa40]
E0320 17:17:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:33.409760 543705 memory.go:184] no items to output this cycle
I0320 17:17:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 17:17:43.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:43.409839 543705 memory.go:191] Add success.
I0320 17:17:43.409841 543705 cpu.go:282] Add success.
I0320 17:17:43.420241 543705 net.go:648] Add success.
I0320 17:17:43.421212 543705 net.go:770] primary dev: ETH0
I0320 17:17:43.421225 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:17:43.421238 543705 net.go:698] Add success.
I0320 17:17:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:17:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:17:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:17:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:17:53.409798 543705 memory.go:184] no items to output this cycle
I0320 17:17:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 17:18:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:03.409778 543705 memory.go:184] no items to output this cycle
I0320 17:18:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 17:18:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:13.409783 543705 memory.go:191] Add success.
I0320 17:18:13.409808 543705 cpu.go:282] Add success.
W0320 17:18:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:18:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:18:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:18:13.419728 543705 net.go:648] Add success.
I0320 17:18:13.422355 543705 net.go:770] primary dev: ETH0
I0320 17:18:13.422367 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:18:13.422379 543705 net.go:698] Add success.
I0320 17:18:13.468591 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"00fa8571-f89c-4653-af1e-317b08fa5189","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:18:13.468622 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:18:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:18:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:18:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 17:18:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:18:14.456630 543705 disk_worker.go:494] system disk:vda1
I0320 17:18:14.456662 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:18:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:18:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:18:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:18:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:18:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:18:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:23.409808 543705 memory.go:184] no items to output this cycle
I0320 17:18:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 17:18:25.353671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:18:25.356130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:18:25.356136 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5380 0xc0000c53c0]
E0320 17:18:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:33.409786 543705 memory.go:184] no items to output this cycle
I0320 17:18:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 17:18:38.564564 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:18:38.564571 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:18:43.409807 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:43.409825 543705 cpu.go:282] Add success.
I0320 17:18:43.409839 543705 memory.go:191] Add success.
I0320 17:18:43.420297 543705 net.go:648] Add success.
I0320 17:18:43.421489 543705 net.go:770] primary dev: ETH0
I0320 17:18:43.421504 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:18:43.421518 543705 net.go:698] Add success.
I0320 17:18:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:18:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:18:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:18:53.410749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:18:53.410766 543705 memory.go:184] no items to output this cycle
I0320 17:18:53.410772 543705 cpu.go:275] no items to output this cycle
E0320 17:19:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:03.409778 543705 memory.go:184] no items to output this cycle
I0320 17:19:03.409804 543705 cpu.go:275] no items to output this cycle
I0320 17:19:13.409898 543705 cpu.go:282] Add success.
E0320 17:19:13.410036 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:13.410054 543705 memory.go:191] Add success.
W0320 17:19:13.410081 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:19:13.410094 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:19:13.410097 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:19:13.419742 543705 net.go:648] Add success.
I0320 17:19:13.422363 543705 net.go:770] primary dev: ETH0
I0320 17:19:13.422377 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:19:13.422390 543705 net.go:698] Add success.
I0320 17:19:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:19:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:19:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 17:19:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:19:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 17:19:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:19:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:19:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:19:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:19:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:19:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:19:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:23.409768 543705 memory.go:184] no items to output this cycle
I0320 17:19:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 17:19:25.357676 543705 disk_info.go:125] begin check local disk info of client
I0320 17:19:25.360171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:19:25.360178 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c540 0xc00034c580]
E0320 17:19:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:33.409778 543705 memory.go:184] no items to output this cycle
I0320 17:19:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 17:19:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:43.409789 543705 memory.go:191] Add success.
I0320 17:19:43.409850 543705 cpu.go:282] Add success.
I0320 17:19:43.420412 543705 net.go:648] Add success.
I0320 17:19:43.421366 543705 net.go:770] primary dev: ETH0
I0320 17:19:43.421384 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:19:43.421404 543705 net.go:698] Add success.
I0320 17:19:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:19:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:19:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:19:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:19:53.409800 543705 memory.go:184] no items to output this cycle
I0320 17:19:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 17:20:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:03.409781 543705 memory.go:184] no items to output this cycle
I0320 17:20:03.409891 543705 cpu.go:275] no items to output this cycle
E0320 17:20:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:13.409785 543705 memory.go:191] Add success.
I0320 17:20:13.409802 543705 cpu.go:282] Add success.
W0320 17:20:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:20:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:20:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:20:13.420370 543705 net.go:648] Add success.
I0320 17:20:13.422947 543705 net.go:770] primary dev: ETH0
I0320 17:20:13.422961 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:20:13.422972 543705 net.go:698] Add success.
I0320 17:20:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:20:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:20:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 17:20:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:20:14.456508 543705 disk_worker.go:494] system disk:vda1
I0320 17:20:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:20:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:20:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:20:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:20:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:20:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:20:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:23.409800 543705 memory.go:184] no items to output this cycle
I0320 17:20:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 17:20:25.361691 543705 disk_info.go:125] begin check local disk info of client
I0320 17:20:25.364164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:20:25.364171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4940 0xc0000c4980]
E0320 17:20:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:33.409782 543705 memory.go:184] no items to output this cycle
I0320 17:20:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 17:20:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:43.409818 543705 memory.go:191] Add success.
I0320 17:20:43.409857 543705 cpu.go:282] Add success.
I0320 17:20:43.420033 543705 net.go:648] Add success.
I0320 17:20:43.422826 543705 net.go:770] primary dev: ETH0
I0320 17:20:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:20:43.422855 543705 net.go:698] Add success.
I0320 17:20:46.457671 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:20:46.457744 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:20:46.457770 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:20:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:20:53.409787 543705 cpu.go:275] no items to output this cycle
I0320 17:20:53.409793 543705 memory.go:184] no items to output this cycle
E0320 17:21:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:03.409802 543705 memory.go:184] no items to output this cycle
I0320 17:21:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 17:21:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:13.409795 543705 memory.go:191] Add success.
I0320 17:21:13.409796 543705 cpu.go:282] Add success.
W0320 17:21:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:21:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:21:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:21:13.420212 543705 net.go:648] Add success.
I0320 17:21:13.422923 543705 net.go:770] primary dev: ETH0
I0320 17:21:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:21:13.422947 543705 net.go:698] Add success.
I0320 17:21:13.469756 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4198d147-2a8d-4174-99af-b8108b485af4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:21:13.469789 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:21:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:21:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:21:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 17:21:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:21:14.456659 543705 disk_worker.go:494] system disk:vda1
I0320 17:21:14.456689 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:21:15.455604 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:21:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:21:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:21:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:21:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:21:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:23.409766 543705 memory.go:184] no items to output this cycle
I0320 17:21:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 17:21:25.365676 543705 disk_info.go:125] begin check local disk info of client
I0320 17:21:25.368146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:21:25.368151 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb6c0 0xc0001fb700]
E0320 17:21:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:33.409783 543705 memory.go:184] no items to output this cycle
I0320 17:21:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 17:21:38.565574 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:21:38.565582 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:21:43.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:43.409815 543705 cpu.go:282] Add success.
I0320 17:21:43.409837 543705 memory.go:191] Add success.
I0320 17:21:43.420219 543705 net.go:648] Add success.
I0320 17:21:43.421103 543705 net.go:770] primary dev: ETH0
I0320 17:21:43.421118 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:21:43.421131 543705 net.go:698] Add success.
I0320 17:21:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:21:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:21:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:21:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:21:53.409775 543705 memory.go:184] no items to output this cycle
I0320 17:21:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 17:22:03.409904 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:03.409917 543705 cpu.go:275] no items to output this cycle
I0320 17:22:03.409922 543705 memory.go:184] no items to output this cycle
E0320 17:22:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:13.409834 543705 memory.go:191] Add success.
I0320 17:22:13.409838 543705 cpu.go:282] Add success.
W0320 17:22:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:22:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:22:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:22:13.420133 543705 net.go:648] Add success.
I0320 17:22:13.422992 543705 net.go:770] primary dev: ETH0
I0320 17:22:13.423005 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:22:13.423018 543705 net.go:698] Add success.
W0320 17:22:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:22:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0320 17:22:14.455222 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:22:14.456124 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:22:14.456135 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:22:14.456141 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:22:14.456687 543705 disk_worker.go:494] system disk:vda1
I0320 17:22:14.456722 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:22:15.456875 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:22:15.456885 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:22:16.457936 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:22:16.457940 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:22:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:22:16.458008 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:22:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:22:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:23.409800 543705 memory.go:184] no items to output this cycle
I0320 17:22:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 17:22:25.369671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:22:25.372144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:22:25.372150 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abb40 0xc0001abb80]
E0320 17:22:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:33.409782 543705 memory.go:184] no items to output this cycle
I0320 17:22:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:22:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:43.409802 543705 memory.go:191] Add success.
I0320 17:22:43.409843 543705 cpu.go:282] Add success.
I0320 17:22:43.420569 543705 net.go:648] Add success.
I0320 17:22:43.423314 543705 net.go:770] primary dev: ETH0
I0320 17:22:43.423334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:22:43.423354 543705 net.go:698] Add success.
I0320 17:22:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:22:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:22:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:22:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:22:53.409779 543705 memory.go:184] no items to output this cycle
I0320 17:22:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 17:23:03.409846 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:03.409874 543705 memory.go:184] no items to output this cycle
I0320 17:23:03.409950 543705 cpu.go:275] no items to output this cycle
E0320 17:23:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:13.409780 543705 memory.go:191] Add success.
W0320 17:23:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:23:13.409812 543705 cpu.go:282] Add success.
W0320 17:23:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:23:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:23:13.420241 543705 net.go:648] Add success.
I0320 17:23:13.423072 543705 net.go:770] primary dev: ETH0
I0320 17:23:13.423084 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:23:13.423096 543705 net.go:698] Add success.
I0320 17:23:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:23:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:23:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 17:23:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:23:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 17:23:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:23:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:23:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:23:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:23:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:23:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:23:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:23.409782 543705 memory.go:184] no items to output this cycle
I0320 17:23:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 17:23:25.373672 543705 disk_info.go:125] begin check local disk info of client
I0320 17:23:25.376236 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:23:25.376242 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470140 0xc000470180]
E0320 17:23:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:33.409773 543705 memory.go:184] no items to output this cycle
I0320 17:23:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 17:23:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:43.409826 543705 memory.go:191] Add success.
I0320 17:23:43.409836 543705 cpu.go:282] Add success.
I0320 17:23:43.420138 543705 net.go:648] Add success.
I0320 17:23:43.422886 543705 net.go:770] primary dev: ETH0
I0320 17:23:43.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:23:43.422916 543705 net.go:698] Add success.
I0320 17:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:23:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:23:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:23:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:23:53.409800 543705 memory.go:184] no items to output this cycle
I0320 17:23:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 17:24:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:03.409798 543705 memory.go:184] no items to output this cycle
I0320 17:24:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 17:24:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:13.409817 543705 memory.go:191] Add success.
I0320 17:24:13.409824 543705 cpu.go:282] Add success.
W0320 17:24:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:24:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:24:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:24:13.420138 543705 net.go:648] Add success.
I0320 17:24:13.422757 543705 net.go:770] primary dev: ETH0
I0320 17:24:13.422771 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:24:13.422783 543705 net.go:698] Add success.
I0320 17:24:13.464002 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"18f74db0-96a8-47a3-bdba-27190c3a7d1f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:24:13.464034 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:24:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:24:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:24:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 17:24:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:24:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 17:24:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:24:15.455618 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:24:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:24:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:24:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:24:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:24:23.410407 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:23.410425 543705 memory.go:184] no items to output this cycle
I0320 17:24:23.410438 543705 cpu.go:275] no items to output this cycle
I0320 17:24:25.377671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:24:25.380130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:24:25.380136 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492ec0 0xc000492f00]
E0320 17:24:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:33.409791 543705 memory.go:184] no items to output this cycle
I0320 17:24:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 17:24:38.566585 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:24:38.566592 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:24:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:43.410664 543705 memory.go:191] Add success.
I0320 17:24:43.409815 543705 cpu.go:282] Add success.
I0320 17:24:43.420495 543705 net.go:648] Add success.
I0320 17:24:43.423189 543705 net.go:770] primary dev: ETH0
I0320 17:24:43.423209 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:24:43.423228 543705 net.go:698] Add success.
I0320 17:24:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:24:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:24:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:24:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:24:53.409776 543705 cpu.go:275] no items to output this cycle
I0320 17:24:53.409788 543705 memory.go:184] no items to output this cycle
E0320 17:25:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:03.409800 543705 memory.go:184] no items to output this cycle
I0320 17:25:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 17:25:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:13.409786 543705 memory.go:191] Add success.
I0320 17:25:13.409794 543705 cpu.go:282] Add success.
W0320 17:25:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:25:13.412650 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:25:13.412654 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:25:13.420292 543705 net.go:648] Add success.
I0320 17:25:13.422002 543705 net.go:770] primary dev: ETH0
I0320 17:25:13.422016 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:25:13.422028 543705 net.go:698] Add success.
I0320 17:25:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:25:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:25:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 17:25:14.455160 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:25:14.456492 543705 disk_worker.go:494] system disk:vda1
I0320 17:25:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:25:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:25:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:25:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:25:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:25:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:25:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:23.409807 543705 memory.go:184] no items to output this cycle
I0320 17:25:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 17:25:25.381671 543705 disk_info.go:125] begin check local disk info of client
I0320 17:25:25.384168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:25:25.384174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bd00 0xc00007bd80]
E0320 17:25:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:33.409780 543705 memory.go:184] no items to output this cycle
I0320 17:25:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 17:25:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:43.409787 543705 memory.go:191] Add success.
I0320 17:25:43.409788 543705 cpu.go:282] Add success.
I0320 17:25:43.419873 543705 net.go:648] Add success.
I0320 17:25:43.422890 543705 net.go:770] primary dev: ETH0
I0320 17:25:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:25:43.422915 543705 net.go:698] Add success.
I0320 17:25:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:25:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:25:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:25:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:25:53.409781 543705 memory.go:184] no items to output this cycle
I0320 17:25:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 17:26:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:03.409804 543705 memory.go:184] no items to output this cycle
I0320 17:26:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 17:26:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:13.409828 543705 memory.go:191] Add success.
I0320 17:26:13.409836 543705 cpu.go:282] Add success.
W0320 17:26:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:26:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:26:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:26:13.420176 543705 net.go:648] Add success.
I0320 17:26:13.422841 543705 net.go:770] primary dev: ETH0
I0320 17:26:13.422854 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:26:13.422865 543705 net.go:698] Add success.
I0320 17:26:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:26:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:26:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0320 17:26:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:26:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 17:26:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:26:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:26:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:26:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:26:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:26:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:26:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 17:26:23.409789 543705 memory.go:184] no items to output this cycle
I0320 17:26:25.385672 543705 disk_info.go:125] begin check local disk info of client
I0320 17:26:25.387891 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:26:25.387897 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa840 0xc0001aa880]
E0320 17:26:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:33.409815 543705 memory.go:184] no items to output this cycle
I0320 17:26:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 17:26:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:43.409795 543705 memory.go:191] Add success.
I0320 17:26:43.409824 543705 cpu.go:282] Add success.
I0320 17:26:43.419977 543705 net.go:648] Add success.
I0320 17:26:43.422476 543705 net.go:770] primary dev: ETH0
I0320 17:26:43.422491 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:26:43.422506 543705 net.go:698] Add success.
I0320 17:26:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:26:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:26:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:26:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:26:53.409800 543705 memory.go:184] no items to output this cycle
I0320 17:26:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 17:27:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:03.409790 543705 memory.go:184] no items to output this cycle
I0320 17:27:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 17:27:13.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:13.409821 543705 cpu.go:282] Add success.
I0320 17:27:13.409829 543705 memory.go:191] Add success.
W0320 17:27:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:27:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:27:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:27:13.420138 543705 net.go:648] Add success.
I0320 17:27:13.429191 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 17:27:13.429268 543705 net.go:770] primary dev: ETH0
I0320 17:27:13.429282 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:27:13.429296 543705 net.go:698] Add success.
I0320 17:27:13.452770 543705 event_worker.go:152] Polling the log file for events...
I0320 17:27:13.467633 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8c2a9952-ca3f-4a1b-851b-8a5b200ef079","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:27:13.467668 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 17:27:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:27:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 17:27:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:27:14.455942 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:27:14.455951 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:27:14.455957 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:27:14.456558 543705 disk_worker.go:494] system disk:vda1
I0320 17:27:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:27:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:27:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:27:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:27:16.457960 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:27:16.458014 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:27:16.458032 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:27:16.472354 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:27:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:23.409788 543705 memory.go:184] no items to output this cycle
I0320 17:27:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 17:27:25.392063 543705 disk_info.go:125] begin check local disk info of client
I0320 17:27:25.394547 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:27:25.394553 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa3c0 0xc0001aa400]
E0320 17:27:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:33.409782 543705 memory.go:184] no items to output this cycle
I0320 17:27:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 17:27:38.567587 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:27:38.567594 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:27:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:43.410823 543705 memory.go:191] Add success.
I0320 17:27:43.409813 543705 cpu.go:282] Add success.
I0320 17:27:43.420550 543705 net.go:648] Add success.
I0320 17:27:43.423205 543705 net.go:770] primary dev: ETH0
I0320 17:27:43.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:27:43.423230 543705 net.go:698] Add success.
I0320 17:27:46.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:27:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:27:46.458103 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:27:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:27:53.409802 543705 memory.go:184] no items to output this cycle
I0320 17:27:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 17:28:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:03.409789 543705 memory.go:184] no items to output this cycle
I0320 17:28:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:28:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:13.409794 543705 memory.go:191] Add success.
I0320 17:28:13.409797 543705 cpu.go:282] Add success.
W0320 17:28:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:28:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:28:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:28:13.420095 543705 net.go:648] Add success.
I0320 17:28:13.422962 543705 net.go:770] primary dev: ETH0
I0320 17:28:13.422977 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:28:13.422991 543705 net.go:698] Add success.
I0320 17:28:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:28:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:28:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 17:28:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:28:14.456779 543705 disk_worker.go:494] system disk:vda1
I0320 17:28:14.456808 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:28:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:28:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:28:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:28:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:28:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:28:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:23.409778 543705 memory.go:184] no items to output this cycle
I0320 17:28:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 17:28:25.397674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:28:25.400098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:28:25.400105 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c0740 0xc0003c0780]
E0320 17:28:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:33.409768 543705 memory.go:184] no items to output this cycle
I0320 17:28:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 17:28:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:43.409781 543705 memory.go:191] Add success.
I0320 17:28:43.409814 543705 cpu.go:282] Add success.
I0320 17:28:43.419985 543705 net.go:648] Add success.
I0320 17:28:43.422547 543705 net.go:770] primary dev: ETH0
I0320 17:28:43.422562 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:28:43.422578 543705 net.go:698] Add success.
I0320 17:28:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:28:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:28:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:28:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:28:53.409798 543705 memory.go:184] no items to output this cycle
I0320 17:28:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 17:29:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:03.409807 543705 memory.go:184] no items to output this cycle
I0320 17:29:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 17:29:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:13.409800 543705 cpu.go:282] Add success.
I0320 17:29:13.409803 543705 memory.go:191] Add success.
W0320 17:29:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:29:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:29:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:29:13.420291 543705 net.go:648] Add success.
I0320 17:29:13.423188 543705 net.go:770] primary dev: ETH0
I0320 17:29:13.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:29:13.423217 543705 net.go:698] Add success.
I0320 17:29:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:29:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:29:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 17:29:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:29:14.456529 543705 disk_worker.go:494] system disk:vda1
I0320 17:29:14.456573 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:29:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:29:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:29:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:29:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:29:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:29:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:23.409792 543705 memory.go:184] no items to output this cycle
I0320 17:29:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 17:29:25.402118 543705 disk_info.go:125] begin check local disk info of client
I0320 17:29:25.404592 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:29:25.404599 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4c40 0xc0000c4c80]
E0320 17:29:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:33.409774 543705 memory.go:184] no items to output this cycle
I0320 17:29:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 17:29:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:43.409779 543705 memory.go:191] Add success.
I0320 17:29:43.409802 543705 cpu.go:282] Add success.
I0320 17:29:43.419853 543705 net.go:648] Add success.
I0320 17:29:43.422410 543705 net.go:770] primary dev: ETH0
I0320 17:29:43.422422 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:29:43.422435 543705 net.go:698] Add success.
I0320 17:29:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:29:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:29:46.458096 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:29:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:29:53.409765 543705 memory.go:184] no items to output this cycle
I0320 17:29:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 17:30:03.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:03.409893 543705 memory.go:184] no items to output this cycle
I0320 17:30:03.409950 543705 cpu.go:275] no items to output this cycle
E0320 17:30:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:13.409802 543705 memory.go:191] Add success.
I0320 17:30:13.409806 543705 cpu.go:282] Add success.
W0320 17:30:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:30:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:30:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:30:13.420071 543705 net.go:648] Add success.
I0320 17:30:13.423217 543705 net.go:770] primary dev: ETH0
I0320 17:30:13.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:30:13.423242 543705 net.go:698] Add success.
I0320 17:30:13.467639 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3a66bbac-9a0d-49ec-bc5d-811df56d0430","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:30:13.467673 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:30:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:30:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:30:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 17:30:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:30:14.456527 543705 disk_worker.go:494] system disk:vda1
I0320 17:30:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:30:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:30:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:30:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:30:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:30:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:30:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:23.409779 543705 memory.go:184] no items to output this cycle
I0320 17:30:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 17:30:25.405670 543705 disk_info.go:125] begin check local disk info of client
I0320 17:30:25.408151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:30:25.408157 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471640 0xc000471680]
E0320 17:30:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:33.409767 543705 memory.go:184] no items to output this cycle
I0320 17:30:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 17:30:38.568602 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:30:38.568608 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:30:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:43.410756 543705 memory.go:191] Add success.
I0320 17:30:43.409817 543705 cpu.go:282] Add success.
I0320 17:30:43.420464 543705 net.go:648] Add success.
I0320 17:30:43.423664 543705 net.go:770] primary dev: ETH0
I0320 17:30:43.423680 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:30:43.423694 543705 net.go:698] Add success.
I0320 17:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:30:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:30:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:30:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:30:53.409785 543705 memory.go:184] no items to output this cycle
I0320 17:30:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 17:31:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:03.409807 543705 memory.go:184] no items to output this cycle
I0320 17:31:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 17:31:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:13.409810 543705 memory.go:191] Add success.
I0320 17:31:13.409818 543705 cpu.go:282] Add success.
W0320 17:31:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:31:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:31:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:31:13.420143 543705 net.go:648] Add success.
I0320 17:31:13.422765 543705 net.go:770] primary dev: ETH0
I0320 17:31:13.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:31:13.422793 543705 net.go:698] Add success.
I0320 17:31:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:31:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:31:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 17:31:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:31:14.456497 543705 disk_worker.go:494] system disk:vda1
I0320 17:31:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:31:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:31:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:31:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:31:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:31:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:31:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:23.409763 543705 memory.go:184] no items to output this cycle
I0320 17:31:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 17:31:25.409674 543705 disk_info.go:125] begin check local disk info of client
I0320 17:31:25.412140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:31:25.412147 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048b400 0xc00048b440]
E0320 17:31:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:33.409768 543705 memory.go:184] no items to output this cycle
I0320 17:31:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 17:31:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:43.409787 543705 memory.go:191] Add success.
I0320 17:31:43.409801 543705 cpu.go:282] Add success.
I0320 17:31:43.420036 543705 net.go:648] Add success.
I0320 17:31:43.422680 543705 net.go:770] primary dev: ETH0
I0320 17:31:43.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:31:43.422709 543705 net.go:698] Add success.
I0320 17:31:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:31:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:31:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:31:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:31:53.409769 543705 memory.go:184] no items to output this cycle
I0320 17:31:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 17:32:03.409897 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:03.409904 543705 cpu.go:275] no items to output this cycle
I0320 17:32:03.409916 543705 memory.go:184] no items to output this cycle
E0320 17:32:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:13.409797 543705 cpu.go:282] Add success.
I0320 17:32:13.409818 543705 memory.go:191] Add success.
W0320 17:32:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:32:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:32:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:32:13.420163 543705 net.go:648] Add success.
I0320 17:32:13.423378 543705 net.go:770] primary dev: ETH0
I0320 17:32:13.423391 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:32:13.423402 543705 net.go:698] Add success.
W0320 17:32:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:32:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 17:32:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:32:14.456950 543705 disk_worker.go:494] system disk:vda1
I0320 17:32:14.457005 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:32:14.457012 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:32:14.457019 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:32:14.457024 543705 custom_config.go:64] query custom config with name: gpu
E0320 17:32:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:32:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:32:16.457896 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:32:16.457896 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:32:16.457949 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:32:16.457969 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:32:16.472318 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:32:23.409741 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:23.409757 543705 memory.go:184] no items to output this cycle
I0320 17:32:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 17:32:25.412785 543705 disk_info.go:125] begin check local disk info of client
I0320 17:32:25.415232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:32:25.415238 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faf00 0xc0001faf40]
E0320 17:32:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:33.409778 543705 memory.go:184] no items to output this cycle
I0320 17:32:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 17:32:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:43.409811 543705 memory.go:191] Add success.
I0320 17:32:43.409821 543705 cpu.go:282] Add success.
I0320 17:32:43.419922 543705 net.go:648] Add success.
I0320 17:32:43.422413 543705 net.go:770] primary dev: ETH0
I0320 17:32:43.422426 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:32:43.422439 543705 net.go:698] Add success.
I0320 17:32:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:32:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:32:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:32:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:32:53.409785 543705 memory.go:184] no items to output this cycle
I0320 17:32:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 17:33:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:03.409774 543705 memory.go:184] no items to output this cycle
I0320 17:33:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 17:33:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:13.409800 543705 memory.go:191] Add success.
I0320 17:33:13.409802 543705 cpu.go:282] Add success.
W0320 17:33:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:33:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:33:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:33:13.420156 543705 net.go:648] Add success.
I0320 17:33:13.423095 543705 net.go:770] primary dev: ETH0
I0320 17:33:13.423110 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:33:13.423123 543705 net.go:698] Add success.
I0320 17:33:13.660878 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bcdec7f9-b8ae-43fd-b369-129a4b90edd5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:33:13.660917 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:33:14.454728 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:33:14.454906 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:33:14.454917 543705 disk_worker.go:708] disk space is not compliant
W0320 17:33:14.454919 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:33:14.456289 543705 disk_worker.go:494] system disk:vda1
I0320 17:33:14.456334 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:33:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:33:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:33:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:33:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:33:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:33:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:23.409797 543705 memory.go:184] no items to output this cycle
I0320 17:33:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 17:33:25.415783 543705 disk_info.go:125] begin check local disk info of client
I0320 17:33:25.418249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:33:25.418254 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027b140 0xc00027b180]
E0320 17:33:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:33.409770 543705 memory.go:184] no items to output this cycle
I0320 17:33:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 17:33:38.569597 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:33:38.569604 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:33:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:43.409784 543705 memory.go:191] Add success.
I0320 17:33:43.409791 543705 cpu.go:282] Add success.
I0320 17:33:43.419955 543705 net.go:648] Add success.
I0320 17:33:43.420948 543705 net.go:770] primary dev: ETH0
I0320 17:33:43.420964 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:33:43.420977 543705 net.go:698] Add success.
I0320 17:33:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:33:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:33:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:33:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:33:53.409905 543705 memory.go:184] no items to output this cycle
I0320 17:33:53.409941 543705 cpu.go:275] no items to output this cycle
E0320 17:34:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:03.409784 543705 memory.go:184] no items to output this cycle
I0320 17:34:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 17:34:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:13.409781 543705 memory.go:191] Add success.
I0320 17:34:13.409806 543705 cpu.go:282] Add success.
W0320 17:34:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:34:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:34:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:34:13.420182 543705 net.go:648] Add success.
I0320 17:34:13.423023 543705 net.go:770] primary dev: ETH0
I0320 17:34:13.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:34:13.423051 543705 net.go:698] Add success.
I0320 17:34:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:34:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:34:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 17:34:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:34:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 17:34:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:34:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:34:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:34:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:34:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:34:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:23.409793 543705 memory.go:184] no items to output this cycle
I0320 17:34:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 17:34:25.418785 543705 disk_info.go:125] begin check local disk info of client
I0320 17:34:25.421231 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:34:25.421237 543705 disk_info.go:196] parse disk info done, disk is : [0xc000261a40 0xc000261a80]
E0320 17:34:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:33.409801 543705 memory.go:184] no items to output this cycle
I0320 17:34:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 17:34:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:43.409788 543705 memory.go:191] Add success.
I0320 17:34:43.409789 543705 cpu.go:282] Add success.
I0320 17:34:43.420019 543705 net.go:648] Add success.
I0320 17:34:43.422918 543705 net.go:770] primary dev: ETH0
I0320 17:34:43.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:34:43.422945 543705 net.go:698] Add success.
I0320 17:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:34:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:34:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:34:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:34:53.409771 543705 memory.go:184] no items to output this cycle
I0320 17:34:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:35:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:03.409791 543705 memory.go:184] no items to output this cycle
I0320 17:35:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 17:35:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:13.409817 543705 memory.go:191] Add success.
I0320 17:35:13.409824 543705 cpu.go:282] Add success.
W0320 17:35:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:35:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:35:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:35:13.420197 543705 net.go:648] Add success.
I0320 17:35:13.422984 543705 net.go:770] primary dev: ETH0
I0320 17:35:13.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:35:13.423009 543705 net.go:698] Add success.
I0320 17:35:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:35:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:35:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 17:35:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:35:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 17:35:14.456614 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:35:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:35:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:35:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:35:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:35:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:35:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:23.409799 543705 memory.go:184] no items to output this cycle
I0320 17:35:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 17:35:25.421809 543705 disk_info.go:125] begin check local disk info of client
I0320 17:35:25.424249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:35:25.424255 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a740 0xc00007a780]
E0320 17:35:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:33.409794 543705 memory.go:184] no items to output this cycle
I0320 17:35:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 17:35:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:43.409786 543705 memory.go:191] Add success.
I0320 17:35:43.409789 543705 cpu.go:282] Add success.
I0320 17:35:43.420001 543705 net.go:648] Add success.
I0320 17:35:43.422620 543705 net.go:770] primary dev: ETH0
I0320 17:35:43.422633 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:35:43.422645 543705 net.go:698] Add success.
I0320 17:35:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:35:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:35:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:35:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:35:53.409783 543705 memory.go:184] no items to output this cycle
I0320 17:35:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 17:36:03.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:03.409899 543705 memory.go:184] no items to output this cycle
I0320 17:36:03.410038 543705 cpu.go:275] no items to output this cycle
E0320 17:36:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:13.409815 543705 memory.go:191] Add success.
I0320 17:36:13.409829 543705 cpu.go:282] Add success.
W0320 17:36:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:36:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:36:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:36:13.420175 543705 net.go:648] Add success.
I0320 17:36:13.423037 543705 net.go:770] primary dev: ETH0
I0320 17:36:13.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:36:13.423062 543705 net.go:698] Add success.
I0320 17:36:13.469655 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2a25599f-4839-40b3-a45f-1fbfd7ea8cd3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:36:13.469691 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:36:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:36:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:36:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 17:36:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:36:14.456680 543705 disk_worker.go:494] system disk:vda1
I0320 17:36:14.456720 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:36:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:36:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:36:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:36:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:36:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:36:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:23.409765 543705 memory.go:184] no items to output this cycle
I0320 17:36:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 17:36:25.424807 543705 disk_info.go:125] begin check local disk info of client
I0320 17:36:25.427274 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:36:25.427280 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370480 0xc0003704c0]
E0320 17:36:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:33.409784 543705 memory.go:184] no items to output this cycle
I0320 17:36:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 17:36:38.570604 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:36:38.570610 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:36:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:43.410804 543705 memory.go:191] Add success.
I0320 17:36:43.409801 543705 cpu.go:282] Add success.
I0320 17:36:43.420510 543705 net.go:648] Add success.
I0320 17:36:43.423591 543705 net.go:770] primary dev: ETH0
I0320 17:36:43.423604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:36:43.423617 543705 net.go:698] Add success.
I0320 17:36:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:36:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:36:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:36:53.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:36:53.409891 543705 cpu.go:275] no items to output this cycle
I0320 17:36:53.409899 543705 memory.go:184] no items to output this cycle
E0320 17:37:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:03.409781 543705 memory.go:184] no items to output this cycle
I0320 17:37:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:37:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:13.409783 543705 memory.go:191] Add success.
W0320 17:37:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:37:13.409810 543705 cpu.go:282] Add success.
W0320 17:37:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:37:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:37:13.420193 543705 net.go:648] Add success.
I0320 17:37:13.422947 543705 net.go:770] primary dev: ETH0
I0320 17:37:13.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:37:13.422976 543705 net.go:698] Add success.
I0320 17:37:13.453520 543705 event_worker.go:152] Polling the log file for events...
W0320 17:37:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:37:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 17:37:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:37:14.455893 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:37:14.455902 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:37:14.455907 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:37:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 17:37:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:37:15.456830 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:37:15.456839 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:37:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:37:16.457994 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:37:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:37:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:37:16.472355 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:37:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:23.409776 543705 memory.go:184] no items to output this cycle
I0320 17:37:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 17:37:25.427828 543705 disk_info.go:125] begin check local disk info of client
I0320 17:37:25.430286 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:37:25.430292 543705 disk_info.go:196] parse disk info done, disk is : [0xc000391c40 0xc000391c80]
E0320 17:37:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:33.409767 543705 memory.go:184] no items to output this cycle
I0320 17:37:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 17:37:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:43.409784 543705 memory.go:191] Add success.
I0320 17:37:43.409801 543705 cpu.go:282] Add success.
I0320 17:37:43.420399 543705 net.go:648] Add success.
I0320 17:37:43.423118 543705 net.go:770] primary dev: ETH0
I0320 17:37:43.423132 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:37:43.423148 543705 net.go:698] Add success.
I0320 17:37:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:37:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:37:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:37:53.409884 543705 cpu.go:275] no items to output this cycle
E0320 17:37:53.409972 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:37:53.409993 543705 memory.go:184] no items to output this cycle
E0320 17:38:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:03.409794 543705 memory.go:184] no items to output this cycle
I0320 17:38:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:38:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:13.409797 543705 memory.go:191] Add success.
I0320 17:38:13.409812 543705 cpu.go:282] Add success.
W0320 17:38:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:38:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:38:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:38:13.420119 543705 net.go:648] Add success.
I0320 17:38:13.422965 543705 net.go:770] primary dev: ETH0
I0320 17:38:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:38:13.422991 543705 net.go:698] Add success.
I0320 17:38:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:38:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:38:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 17:38:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:38:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 17:38:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:38:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:38:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:38:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:38:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:38:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:23.409791 543705 memory.go:184] no items to output this cycle
I0320 17:38:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 17:38:25.430832 543705 disk_info.go:125] begin check local disk info of client
I0320 17:38:25.433317 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:38:25.433324 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b29c0 0xc0002b2a00]
E0320 17:38:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:33.409804 543705 memory.go:184] no items to output this cycle
I0320 17:38:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 17:38:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:43.409794 543705 memory.go:191] Add success.
I0320 17:38:43.409797 543705 cpu.go:282] Add success.
I0320 17:38:43.419864 543705 net.go:648] Add success.
I0320 17:38:43.422869 543705 net.go:770] primary dev: ETH0
I0320 17:38:43.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:38:43.422895 543705 net.go:698] Add success.
I0320 17:38:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:38:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:38:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:38:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:38:53.409773 543705 memory.go:184] no items to output this cycle
I0320 17:38:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 17:39:03.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:03.409898 543705 memory.go:184] no items to output this cycle
I0320 17:39:03.409919 543705 cpu.go:275] no items to output this cycle
E0320 17:39:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:13.409801 543705 memory.go:191] Add success.
I0320 17:39:13.409807 543705 cpu.go:282] Add success.
W0320 17:39:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:39:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:39:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:39:13.420141 543705 net.go:648] Add success.
I0320 17:39:13.422918 543705 net.go:770] primary dev: ETH0
I0320 17:39:13.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:39:13.422943 543705 net.go:698] Add success.
I0320 17:39:13.464474 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3cb8c28d-0d72-4131-b6f3-bf1d8981d4de","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:39:13.464509 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:39:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:39:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:39:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 17:39:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:39:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 17:39:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:39:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:39:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:39:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:39:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:39:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:39:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:23.409801 543705 memory.go:184] no items to output this cycle
I0320 17:39:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 17:39:25.433866 543705 disk_info.go:125] begin check local disk info of client
I0320 17:39:25.436362 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:39:25.436368 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2f00 0xc0002b2f40]
E0320 17:39:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:33.409796 543705 memory.go:184] no items to output this cycle
I0320 17:39:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 17:39:38.571611 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:39:38.571617 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:39:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:43.410852 543705 memory.go:191] Add success.
I0320 17:39:43.409812 543705 cpu.go:282] Add success.
I0320 17:39:43.420589 543705 net.go:648] Add success.
I0320 17:39:43.423649 543705 net.go:770] primary dev: ETH0
I0320 17:39:43.423662 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:39:43.423673 543705 net.go:698] Add success.
I0320 17:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:39:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:39:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:39:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:39:53.409785 543705 memory.go:184] no items to output this cycle
I0320 17:39:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 17:40:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:03.409907 543705 cpu.go:275] no items to output this cycle
I0320 17:40:03.409911 543705 memory.go:184] no items to output this cycle
E0320 17:40:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:13.409786 543705 memory.go:191] Add success.
W0320 17:40:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:40:13.409816 543705 cpu.go:282] Add success.
W0320 17:40:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:40:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:40:13.420052 543705 net.go:648] Add success.
I0320 17:40:13.422837 543705 net.go:770] primary dev: ETH0
I0320 17:40:13.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:40:13.422862 543705 net.go:698] Add success.
I0320 17:40:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:40:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:40:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 17:40:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:40:14.456552 543705 disk_worker.go:494] system disk:vda1
I0320 17:40:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:40:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:40:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:40:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:40:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:40:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:40:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:23.409793 543705 memory.go:184] no items to output this cycle
I0320 17:40:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 17:40:25.436873 543705 disk_info.go:125] begin check local disk info of client
I0320 17:40:25.439343 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:40:25.439349 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb800 0xc0001fb840]
E0320 17:40:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:33.409776 543705 memory.go:184] no items to output this cycle
I0320 17:40:33.409779 543705 cpu.go:275] no items to output this cycle
E0320 17:40:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:43.409783 543705 memory.go:191] Add success.
I0320 17:40:43.409786 543705 cpu.go:282] Add success.
I0320 17:40:43.420002 543705 net.go:648] Add success.
I0320 17:40:43.423082 543705 net.go:770] primary dev: ETH0
I0320 17:40:43.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:40:43.423107 543705 net.go:698] Add success.
I0320 17:40:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:40:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:40:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:40:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:40:53.409781 543705 cpu.go:275] no items to output this cycle
I0320 17:40:53.409792 543705 memory.go:184] no items to output this cycle
E0320 17:41:03.409865 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:03.409885 543705 memory.go:184] no items to output this cycle
I0320 17:41:03.409946 543705 cpu.go:275] no items to output this cycle
E0320 17:41:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:13.409813 543705 memory.go:191] Add success.
I0320 17:41:13.409822 543705 cpu.go:282] Add success.
W0320 17:41:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:41:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:41:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:41:13.420115 543705 net.go:648] Add success.
I0320 17:41:13.422720 543705 net.go:770] primary dev: ETH0
I0320 17:41:13.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:41:13.422748 543705 net.go:698] Add success.
I0320 17:41:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:41:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:41:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 17:41:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:41:14.456496 543705 disk_worker.go:494] system disk:vda1
I0320 17:41:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:41:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:41:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:41:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:41:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:41:16.472356 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:41:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:23.409792 543705 memory.go:184] no items to output this cycle
I0320 17:41:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 17:41:25.439896 543705 disk_info.go:125] begin check local disk info of client
I0320 17:41:25.442367 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:41:25.442373 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b0a80 0xc0003b0ac0]
E0320 17:41:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:33.409773 543705 memory.go:184] no items to output this cycle
I0320 17:41:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 17:41:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:43.409788 543705 memory.go:191] Add success.
I0320 17:41:43.409817 543705 cpu.go:282] Add success.
I0320 17:41:43.420442 543705 net.go:648] Add success.
I0320 17:41:43.423346 543705 net.go:770] primary dev: ETH0
I0320 17:41:43.423360 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:41:43.423375 543705 net.go:698] Add success.
I0320 17:41:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:41:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:41:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:41:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:41:53.409801 543705 memory.go:184] no items to output this cycle
I0320 17:41:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 17:42:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:03.409785 543705 memory.go:184] no items to output this cycle
I0320 17:42:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 17:42:13.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:13.409960 543705 memory.go:191] Add success.
W0320 17:42:13.409999 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:42:13.410001 543705 cpu.go:282] Add success.
W0320 17:42:13.410016 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:42:13.410021 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:42:13.419720 543705 net.go:648] Add success.
I0320 17:42:13.422590 543705 net.go:770] primary dev: ETH0
I0320 17:42:13.422603 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:42:13.422615 543705 net.go:698] Add success.
I0320 17:42:13.463564 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"83ab9779-4ef5-40bf-b2b3-531db7ed5501","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:42:13.463596 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 17:42:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:42:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 17:42:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:42:14.456079 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:42:14.456089 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:42:14.456094 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:42:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 17:42:14.456515 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:42:15.456868 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:42:15.456877 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:42:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:42:16.457965 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:42:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:42:16.458022 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:42:16.472361 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:42:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:23.409773 543705 memory.go:184] no items to output this cycle
I0320 17:42:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 17:42:25.442902 543705 disk_info.go:125] begin check local disk info of client
I0320 17:42:25.445345 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:42:25.445351 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab280 0xc0001ab2c0]
E0320 17:42:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:33.409770 543705 memory.go:184] no items to output this cycle
I0320 17:42:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 17:42:38.572609 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:42:38.572616 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:42:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:43.410744 543705 memory.go:191] Add success.
I0320 17:42:43.409814 543705 cpu.go:282] Add success.
I0320 17:42:43.420524 543705 net.go:648] Add success.
I0320 17:42:43.423319 543705 net.go:770] primary dev: ETH0
I0320 17:42:43.423332 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:42:43.423347 543705 net.go:698] Add success.
I0320 17:42:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:42:46.458069 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:42:46.458099 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:42:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:42:53.409783 543705 memory.go:184] no items to output this cycle
I0320 17:42:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 17:43:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:03.409774 543705 memory.go:184] no items to output this cycle
I0320 17:43:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:43:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:13.409792 543705 memory.go:191] Add success.
W0320 17:43:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:43:13.409826 543705 cpu.go:282] Add success.
W0320 17:43:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:43:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:43:13.420156 543705 net.go:648] Add success.
I0320 17:43:13.422994 543705 net.go:770] primary dev: ETH0
I0320 17:43:13.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:43:13.423019 543705 net.go:698] Add success.
I0320 17:43:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:43:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:43:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 17:43:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:43:14.456513 543705 disk_worker.go:494] system disk:vda1
I0320 17:43:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:43:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:43:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:43:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:43:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:43:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:43:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:23.409773 543705 memory.go:184] no items to output this cycle
I0320 17:43:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 17:43:25.445915 543705 disk_info.go:125] begin check local disk info of client
I0320 17:43:25.448376 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:43:25.448381 543705 disk_info.go:196] parse disk info done, disk is : [0xc000514500 0xc000514540]
E0320 17:43:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:33.409773 543705 memory.go:184] no items to output this cycle
I0320 17:43:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:43:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:43.409794 543705 memory.go:191] Add success.
I0320 17:43:43.409810 543705 cpu.go:282] Add success.
I0320 17:43:43.420384 543705 net.go:648] Add success.
I0320 17:43:43.423450 543705 net.go:770] primary dev: ETH0
I0320 17:43:43.423462 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:43:43.423476 543705 net.go:698] Add success.
I0320 17:43:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:43:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:43:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:43:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:43:53.409768 543705 memory.go:184] no items to output this cycle
I0320 17:43:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 17:44:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:03.409800 543705 cpu.go:275] no items to output this cycle
I0320 17:44:03.409808 543705 memory.go:184] no items to output this cycle
E0320 17:44:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:13.409790 543705 memory.go:191] Add success.
I0320 17:44:13.409790 543705 cpu.go:282] Add success.
W0320 17:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:44:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:44:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:44:13.419713 543705 net.go:648] Add success.
I0320 17:44:13.422847 543705 net.go:770] primary dev: ETH0
I0320 17:44:13.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:44:13.422871 543705 net.go:698] Add success.
I0320 17:44:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:44:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:44:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 17:44:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:44:14.456538 543705 disk_worker.go:494] system disk:vda1
I0320 17:44:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:44:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:44:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:44:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:44:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:44:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:44:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:23.409766 543705 memory.go:184] no items to output this cycle
I0320 17:44:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 17:44:25.448937 543705 disk_info.go:125] begin check local disk info of client
I0320 17:44:25.451390 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:44:25.451396 543705 disk_info.go:196] parse disk info done, disk is : [0xc000515300 0xc000515340]
E0320 17:44:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:33.409780 543705 memory.go:184] no items to output this cycle
I0320 17:44:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 17:44:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:43.409815 543705 memory.go:191] Add success.
I0320 17:44:43.409826 543705 cpu.go:282] Add success.
I0320 17:44:43.419886 543705 net.go:648] Add success.
I0320 17:44:43.422798 543705 net.go:770] primary dev: ETH0
I0320 17:44:43.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:44:43.422828 543705 net.go:698] Add success.
I0320 17:44:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:44:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:44:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:44:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:44:53.409770 543705 memory.go:184] no items to output this cycle
I0320 17:44:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 17:45:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:03.409773 543705 memory.go:184] no items to output this cycle
I0320 17:45:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:45:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:13.409812 543705 memory.go:191] Add success.
I0320 17:45:13.409813 543705 cpu.go:282] Add success.
W0320 17:45:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:45:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:45:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:45:13.419708 543705 net.go:648] Add success.
I0320 17:45:13.422410 543705 net.go:770] primary dev: ETH0
I0320 17:45:13.422426 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:45:13.422439 543705 net.go:698] Add success.
I0320 17:45:13.469077 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9e5c0ba3-df38-4c79-8570-c9a9f99fec55","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:45:13.469107 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:45:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:45:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:45:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 17:45:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:45:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 17:45:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:45:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:45:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:45:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:45:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:45:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:45:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:23.409793 543705 memory.go:184] no items to output this cycle
I0320 17:45:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 17:45:25.451941 543705 disk_info.go:125] begin check local disk info of client
I0320 17:45:25.454396 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:45:25.454402 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa940 0xc0001aa980]
E0320 17:45:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:33.409774 543705 memory.go:184] no items to output this cycle
I0320 17:45:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 17:45:38.573620 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:45:38.573626 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:45:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:43.410648 543705 memory.go:191] Add success.
I0320 17:45:43.409811 543705 cpu.go:282] Add success.
I0320 17:45:43.420419 543705 net.go:648] Add success.
I0320 17:45:43.422957 543705 net.go:770] primary dev: ETH0
I0320 17:45:43.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:45:43.422984 543705 net.go:698] Add success.
I0320 17:45:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:45:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:45:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:45:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:45:53.409770 543705 memory.go:184] no items to output this cycle
I0320 17:45:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 17:46:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:03.409790 543705 memory.go:184] no items to output this cycle
I0320 17:46:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:46:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:13.409778 543705 memory.go:191] Add success.
I0320 17:46:13.409802 543705 cpu.go:282] Add success.
W0320 17:46:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:46:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:46:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:46:13.419730 543705 net.go:648] Add success.
I0320 17:46:13.422678 543705 net.go:770] primary dev: ETH0
I0320 17:46:13.422693 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:46:13.422705 543705 net.go:698] Add success.
I0320 17:46:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:46:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:46:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 17:46:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:46:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 17:46:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:46:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:46:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:46:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:46:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:46:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:23.409773 543705 cpu.go:275] no items to output this cycle
I0320 17:46:23.409777 543705 memory.go:184] no items to output this cycle
I0320 17:46:25.454964 543705 disk_info.go:125] begin check local disk info of client
I0320 17:46:25.457382 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:46:25.457388 543705 disk_info.go:196] parse disk info done, disk is : [0xc000514e40 0xc000514e80]
E0320 17:46:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:33.409764 543705 memory.go:184] no items to output this cycle
I0320 17:46:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 17:46:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:43.409812 543705 memory.go:191] Add success.
I0320 17:46:43.409817 543705 cpu.go:282] Add success.
I0320 17:46:43.419869 543705 net.go:770] primary dev: ETH0
I0320 17:46:43.419882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:46:43.419894 543705 net.go:698] Add success.
I0320 17:46:43.420243 543705 net.go:648] Add success.
I0320 17:46:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:46:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:46:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:46:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:46:53.409784 543705 memory.go:184] no items to output this cycle
I0320 17:46:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 17:47:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:03.409786 543705 memory.go:184] no items to output this cycle
I0320 17:47:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 17:47:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:13.409818 543705 memory.go:191] Add success.
I0320 17:47:13.409823 543705 cpu.go:282] Add success.
W0320 17:47:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:47:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:47:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:47:13.419718 543705 net.go:648] Add success.
I0320 17:47:13.422342 543705 net.go:770] primary dev: ETH0
I0320 17:47:13.422356 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:47:13.422369 543705 net.go:698] Add success.
I0320 17:47:13.452920 543705 event_worker.go:152] Polling the log file for events...
W0320 17:47:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:47:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 17:47:14.455161 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:47:14.456924 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:47:14.456933 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:47:14.456939 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:47:14.456981 543705 disk_worker.go:494] system disk:vda1
I0320 17:47:14.457007 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:47:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:47:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:47:16.457927 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:47:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:47:16.457983 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:47:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:47:16.472322 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:47:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:23.409793 543705 memory.go:184] no items to output this cycle
I0320 17:47:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 17:47:25.457982 543705 disk_info.go:125] begin check local disk info of client
I0320 17:47:25.460378 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:47:25.460384 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab200 0xc0001ab240]
E0320 17:47:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:33.409768 543705 memory.go:184] no items to output this cycle
I0320 17:47:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 17:47:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:43.409809 543705 memory.go:191] Add success.
I0320 17:47:43.409813 543705 cpu.go:282] Add success.
I0320 17:47:43.419895 543705 net.go:648] Add success.
I0320 17:47:43.422940 543705 net.go:770] primary dev: ETH0
I0320 17:47:43.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:47:43.422966 543705 net.go:698] Add success.
I0320 17:47:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:47:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:47:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:47:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:47:53.409803 543705 memory.go:184] no items to output this cycle
I0320 17:47:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 17:48:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:03.409780 543705 memory.go:184] no items to output this cycle
I0320 17:48:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 17:48:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:13.409782 543705 memory.go:191] Add success.
W0320 17:48:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:48:13.409812 543705 cpu.go:282] Add success.
W0320 17:48:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:48:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:48:13.420225 543705 net.go:648] Add success.
I0320 17:48:13.422933 543705 net.go:770] primary dev: ETH0
I0320 17:48:13.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:48:13.422956 543705 net.go:698] Add success.
I0320 17:48:13.568093 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1e835470-164e-4336-b1ea-bb34315be429","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:48:13.568136 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:48:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:48:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:48:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 17:48:14.455166 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:48:14.456519 543705 disk_worker.go:494] system disk:vda1
I0320 17:48:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:48:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:48:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:48:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:48:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:48:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:48:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 17:48:23.409785 543705 memory.go:184] no items to output this cycle
I0320 17:48:25.460992 543705 disk_info.go:125] begin check local disk info of client
I0320 17:48:25.463466 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:48:25.463472 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5dc0 0xc0000c5e00]
E0320 17:48:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:33.409798 543705 memory.go:184] no items to output this cycle
I0320 17:48:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 17:48:38.574617 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:48:38.574624 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:48:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:43.410834 543705 memory.go:191] Add success.
I0320 17:48:43.409836 543705 cpu.go:282] Add success.
I0320 17:48:43.420542 543705 net.go:648] Add success.
I0320 17:48:43.423404 543705 net.go:770] primary dev: ETH0
I0320 17:48:43.423416 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:48:43.423429 543705 net.go:698] Add success.
I0320 17:48:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:48:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:48:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:48:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:48:53.409809 543705 memory.go:184] no items to output this cycle
I0320 17:48:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 17:49:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:03.409776 543705 memory.go:184] no items to output this cycle
I0320 17:49:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 17:49:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:13.409804 543705 memory.go:191] Add success.
I0320 17:49:13.409813 543705 cpu.go:282] Add success.
W0320 17:49:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:49:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:49:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:49:13.419711 543705 net.go:648] Add success.
I0320 17:49:13.422805 543705 net.go:770] primary dev: ETH0
I0320 17:49:13.422818 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:49:13.422829 543705 net.go:698] Add success.
I0320 17:49:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:49:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:49:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 17:49:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:49:14.456509 543705 disk_worker.go:494] system disk:vda1
I0320 17:49:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:49:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:49:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:49:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:49:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:49:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:49:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 17:49:23.409780 543705 memory.go:184] no items to output this cycle
I0320 17:49:25.464010 543705 disk_info.go:125] begin check local disk info of client
I0320 17:49:25.466479 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:49:25.466485 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a3c0 0xc00048a400]
E0320 17:49:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:33.409766 543705 memory.go:184] no items to output this cycle
I0320 17:49:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 17:49:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:43.409811 543705 memory.go:191] Add success.
I0320 17:49:43.409821 543705 cpu.go:282] Add success.
I0320 17:49:43.419868 543705 net.go:648] Add success.
I0320 17:49:43.422748 543705 net.go:770] primary dev: ETH0
I0320 17:49:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:49:43.422773 543705 net.go:698] Add success.
I0320 17:49:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:49:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:49:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:49:53.410342 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:49:53.410359 543705 memory.go:184] no items to output this cycle
I0320 17:49:53.410372 543705 cpu.go:275] no items to output this cycle
E0320 17:50:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:03.409803 543705 memory.go:184] no items to output this cycle
I0320 17:50:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 17:50:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:13.409779 543705 memory.go:191] Add success.
I0320 17:50:13.409797 543705 cpu.go:282] Add success.
W0320 17:50:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:50:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:50:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:50:13.420351 543705 net.go:648] Add success.
I0320 17:50:13.422997 543705 net.go:770] primary dev: ETH0
I0320 17:50:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:50:13.423021 543705 net.go:698] Add success.
I0320 17:50:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:50:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:50:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 17:50:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:50:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 17:50:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:50:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:50:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:50:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:50:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:50:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:50:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:23.409769 543705 memory.go:184] no items to output this cycle
I0320 17:50:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 17:50:25.467027 543705 disk_info.go:125] begin check local disk info of client
I0320 17:50:25.469454 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:50:25.469460 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d5600 0xc0003d5640]
E0320 17:50:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:33.409780 543705 memory.go:184] no items to output this cycle
I0320 17:50:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 17:50:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:43.409828 543705 memory.go:191] Add success.
I0320 17:50:43.409830 543705 cpu.go:282] Add success.
I0320 17:50:43.420009 543705 net.go:648] Add success.
I0320 17:50:43.423338 543705 net.go:770] primary dev: ETH0
I0320 17:50:43.423351 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:50:43.423363 543705 net.go:698] Add success.
I0320 17:50:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:50:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:50:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:50:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:50:53.409774 543705 memory.go:184] no items to output this cycle
I0320 17:50:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 17:51:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:03.409802 543705 memory.go:184] no items to output this cycle
I0320 17:51:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 17:51:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:13.409811 543705 memory.go:191] Add success.
I0320 17:51:13.409822 543705 cpu.go:282] Add success.
W0320 17:51:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:51:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:51:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:51:13.420044 543705 net.go:648] Add success.
I0320 17:51:13.423077 543705 net.go:770] primary dev: ETH0
I0320 17:51:13.423090 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:51:13.423103 543705 net.go:698] Add success.
I0320 17:51:13.463998 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"64a608cf-cb22-419b-ac32-f5fca4ff6edb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:51:13.464031 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:51:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:51:14.455386 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:51:14.455420 543705 disk_worker.go:708] disk space is not compliant
W0320 17:51:14.455424 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:51:14.457055 543705 disk_worker.go:494] system disk:vda1
I0320 17:51:14.457088 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:51:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:51:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:51:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:51:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:51:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:51:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 17:51:23.409778 543705 memory.go:184] no items to output this cycle
I0320 17:51:25.470044 543705 disk_info.go:125] begin check local disk info of client
I0320 17:51:25.472477 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:51:25.472483 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4880 0xc0000c48c0]
E0320 17:51:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:33.409771 543705 memory.go:184] no items to output this cycle
I0320 17:51:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 17:51:38.575629 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:51:38.575636 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:51:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:43.410651 543705 memory.go:191] Add success.
I0320 17:51:43.409791 543705 cpu.go:282] Add success.
I0320 17:51:43.420341 543705 net.go:648] Add success.
I0320 17:51:43.423115 543705 net.go:770] primary dev: ETH0
I0320 17:51:43.423128 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:51:43.423143 543705 net.go:698] Add success.
I0320 17:51:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:51:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:51:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:51:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:51:53.409785 543705 memory.go:184] no items to output this cycle
I0320 17:51:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:52:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:03.409816 543705 memory.go:184] no items to output this cycle
I0320 17:52:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 17:52:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:13.409816 543705 memory.go:191] Add success.
I0320 17:52:13.409821 543705 cpu.go:282] Add success.
W0320 17:52:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:52:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:52:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:52:13.420110 543705 net.go:648] Add success.
I0320 17:52:13.422846 543705 net.go:770] primary dev: ETH0
I0320 17:52:13.422860 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:52:13.422871 543705 net.go:698] Add success.
W0320 17:52:14.455094 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:52:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 17:52:14.455159 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:52:14.456165 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:52:14.456173 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:52:14.456178 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:52:14.457594 543705 disk_worker.go:494] system disk:vda1
I0320 17:52:14.457622 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:52:15.456769 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:52:15.456778 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:52:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:52:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:52:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:52:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:52:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:52:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:23.409797 543705 memory.go:184] no items to output this cycle
I0320 17:52:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 17:52:25.473047 543705 disk_info.go:125] begin check local disk info of client
I0320 17:52:25.475555 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:52:25.475561 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bd80 0xc00007bdc0]
E0320 17:52:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 17:52:33.409798 543705 memory.go:184] no items to output this cycle
E0320 17:52:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:43.409799 543705 memory.go:191] Add success.
I0320 17:52:43.409806 543705 cpu.go:282] Add success.
I0320 17:52:43.419892 543705 net.go:648] Add success.
I0320 17:52:43.423177 543705 net.go:770] primary dev: ETH0
I0320 17:52:43.423191 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:52:43.423206 543705 net.go:698] Add success.
I0320 17:52:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:52:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:52:46.458108 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:52:53.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:52:53.409826 543705 memory.go:184] no items to output this cycle
I0320 17:52:53.409839 543705 cpu.go:275] no items to output this cycle
E0320 17:53:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:03.409789 543705 memory.go:184] no items to output this cycle
I0320 17:53:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:53:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:13.409788 543705 memory.go:191] Add success.
I0320 17:53:13.409813 543705 cpu.go:282] Add success.
W0320 17:53:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:53:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:53:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:53:13.420165 543705 net.go:648] Add success.
I0320 17:53:13.422994 543705 net.go:770] primary dev: ETH0
I0320 17:53:13.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:53:13.423024 543705 net.go:698] Add success.
I0320 17:53:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:53:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:53:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 17:53:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:53:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 17:53:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:53:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:53:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:53:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:53:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:53:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:53:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:23.409803 543705 memory.go:184] no items to output this cycle
I0320 17:53:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 17:53:25.476071 543705 disk_info.go:125] begin check local disk info of client
I0320 17:53:25.478583 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:53:25.478588 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464440 0xc000464480]
E0320 17:53:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:33.409810 543705 memory.go:184] no items to output this cycle
I0320 17:53:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 17:53:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:43.409827 543705 memory.go:191] Add success.
I0320 17:53:43.409836 543705 cpu.go:282] Add success.
I0320 17:53:43.420009 543705 net.go:648] Add success.
I0320 17:53:43.422719 543705 net.go:770] primary dev: ETH0
I0320 17:53:43.422734 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:53:43.422749 543705 net.go:698] Add success.
I0320 17:53:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:53:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:53:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:53:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:53:53.409777 543705 memory.go:184] no items to output this cycle
I0320 17:53:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 17:54:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:03.409806 543705 memory.go:184] no items to output this cycle
I0320 17:54:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 17:54:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:13.409774 543705 memory.go:191] Add success.
W0320 17:54:13.409800 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 17:54:13.409806 543705 cpu.go:282] Add success.
W0320 17:54:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:54:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:54:13.420085 543705 net.go:648] Add success.
I0320 17:54:13.422873 543705 net.go:770] primary dev: ETH0
I0320 17:54:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:54:13.422903 543705 net.go:698] Add success.
I0320 17:54:13.469013 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9ed705db-6996-4f4c-bec2-2d5311c7921e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:54:13.469047 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 17:54:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:54:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:54:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 17:54:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:54:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 17:54:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:54:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:54:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:54:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:54:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:54:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:54:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:23.409771 543705 memory.go:184] no items to output this cycle
I0320 17:54:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 17:54:25.479087 543705 disk_info.go:125] begin check local disk info of client
I0320 17:54:25.481531 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:54:25.481537 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b3c0 0xc00007b400]
E0320 17:54:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:33.409797 543705 memory.go:184] no items to output this cycle
I0320 17:54:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 17:54:38.576623 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:54:38.576629 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:54:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:43.410652 543705 memory.go:191] Add success.
I0320 17:54:43.409794 543705 cpu.go:282] Add success.
I0320 17:54:43.420360 543705 net.go:648] Add success.
I0320 17:54:43.423222 543705 net.go:770] primary dev: ETH0
I0320 17:54:43.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:54:43.423248 543705 net.go:698] Add success.
I0320 17:54:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:54:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:54:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:54:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:54:53.409784 543705 memory.go:184] no items to output this cycle
I0320 17:54:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 17:55:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:03.409797 543705 memory.go:184] no items to output this cycle
I0320 17:55:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 17:55:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:13.409777 543705 memory.go:191] Add success.
I0320 17:55:13.409801 543705 cpu.go:282] Add success.
W0320 17:55:13.409803 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:55:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:55:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:55:13.419993 543705 net.go:770] primary dev: ETH0
I0320 17:55:13.420009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:55:13.420023 543705 net.go:698] Add success.
I0320 17:55:13.420375 543705 net.go:648] Add success.
I0320 17:55:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:55:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:55:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 17:55:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:55:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 17:55:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:55:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:55:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:55:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:55:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:55:16.472388 543705 disk_local_worker.go:436] Get disk info: []
I0320 17:55:23.409870 543705 cpu.go:275] no items to output this cycle
E0320 17:55:23.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:23.409888 543705 memory.go:184] no items to output this cycle
I0320 17:55:25.482102 543705 disk_info.go:125] begin check local disk info of client
I0320 17:55:25.484531 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:55:25.484537 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ae780 0xc0003ae7c0]
E0320 17:55:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:33.409773 543705 memory.go:184] no items to output this cycle
I0320 17:55:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 17:55:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:43.409817 543705 memory.go:191] Add success.
I0320 17:55:43.409817 543705 cpu.go:282] Add success.
I0320 17:55:43.419973 543705 net.go:648] Add success.
I0320 17:55:43.422625 543705 net.go:770] primary dev: ETH0
I0320 17:55:43.422639 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:55:43.422650 543705 net.go:698] Add success.
I0320 17:55:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:55:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:55:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:55:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:55:53.409770 543705 memory.go:184] no items to output this cycle
I0320 17:55:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 17:56:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:03.409778 543705 memory.go:184] no items to output this cycle
I0320 17:56:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 17:56:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:13.409789 543705 memory.go:191] Add success.
I0320 17:56:13.409792 543705 cpu.go:282] Add success.
W0320 17:56:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:56:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:56:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:56:13.420079 543705 net.go:648] Add success.
I0320 17:56:13.422900 543705 net.go:770] primary dev: ETH0
I0320 17:56:13.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:56:13.422929 543705 net.go:698] Add success.
I0320 17:56:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:56:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:56:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 17:56:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:56:14.456504 543705 disk_worker.go:494] system disk:vda1
I0320 17:56:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:56:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:56:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:56:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:56:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:56:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:56:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:23.409774 543705 memory.go:184] no items to output this cycle
I0320 17:56:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 17:56:25.485105 543705 disk_info.go:125] begin check local disk info of client
I0320 17:56:25.487556 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:56:25.487563 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c000 0xc00035c040]
E0320 17:56:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:33.409795 543705 memory.go:184] no items to output this cycle
I0320 17:56:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 17:56:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:43.409797 543705 memory.go:191] Add success.
I0320 17:56:43.409812 543705 cpu.go:282] Add success.
I0320 17:56:43.419949 543705 net.go:648] Add success.
I0320 17:56:43.422712 543705 net.go:770] primary dev: ETH0
I0320 17:56:43.422725 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:56:43.422736 543705 net.go:698] Add success.
I0320 17:56:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:56:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:56:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:56:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:56:53.409784 543705 memory.go:184] no items to output this cycle
I0320 17:56:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 17:57:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:03.409772 543705 memory.go:184] no items to output this cycle
I0320 17:57:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 17:57:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:13.409805 543705 memory.go:191] Add success.
I0320 17:57:13.409815 543705 cpu.go:282] Add success.
W0320 17:57:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:57:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:57:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:57:13.420187 543705 net.go:648] Add success.
I0320 17:57:13.422823 543705 net.go:770] primary dev: ETH0
I0320 17:57:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:57:13.422853 543705 net.go:698] Add success.
I0320 17:57:13.429154 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 17:57:13.453327 543705 event_worker.go:152] Polling the log file for events...
I0320 17:57:13.468946 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"63f21e38-ddc2-4c37-8e3e-697da1673ee2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 17:57:13.468996 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 17:57:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:57:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 17:57:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 17:57:14.455880 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 17:57:14.455888 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 17:57:14.455893 543705 custom_config.go:64] query custom config with name: gpu
I0320 17:57:14.456547 543705 disk_worker.go:494] system disk:vda1
I0320 17:57:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 17:57:15.456866 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 17:57:15.456875 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:57:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 17:57:16.457907 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 17:57:16.457960 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:57:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:57:16.472311 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:57:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:23.409800 543705 memory.go:184] no items to output this cycle
I0320 17:57:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 17:57:25.488126 543705 disk_info.go:125] begin check local disk info of client
I0320 17:57:25.490649 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:57:25.490656 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000da100 0xc0000da180]
E0320 17:57:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:33.409781 543705 memory.go:184] no items to output this cycle
I0320 17:57:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 17:57:38.577639 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 17:57:38.577659 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 17:57:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:43.410874 543705 memory.go:191] Add success.
I0320 17:57:43.409821 543705 cpu.go:282] Add success.
I0320 17:57:43.420621 543705 net.go:648] Add success.
I0320 17:57:43.423149 543705 net.go:770] primary dev: ETH0
I0320 17:57:43.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:57:43.423174 543705 net.go:698] Add success.
I0320 17:57:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:57:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:57:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:57:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:57:53.409773 543705 memory.go:184] no items to output this cycle
I0320 17:57:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 17:58:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:03.409783 543705 memory.go:184] no items to output this cycle
I0320 17:58:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 17:58:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:13.409800 543705 memory.go:191] Add success.
I0320 17:58:13.409818 543705 cpu.go:282] Add success.
W0320 17:58:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:58:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:58:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:58:13.420109 543705 net.go:648] Add success.
I0320 17:58:13.423111 543705 net.go:770] primary dev: ETH0
I0320 17:58:13.423125 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:58:13.423138 543705 net.go:698] Add success.
I0320 17:58:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:58:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:58:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 17:58:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:58:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 17:58:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:58:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:58:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:58:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:58:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:58:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:58:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:23.409763 543705 memory.go:184] no items to output this cycle
I0320 17:58:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 17:58:25.490731 543705 disk_info.go:125] begin check local disk info of client
I0320 17:58:25.493434 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:58:25.493441 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304000 0xc000304040]
E0320 17:58:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:33.409795 543705 memory.go:184] no items to output this cycle
I0320 17:58:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 17:58:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:43.409811 543705 memory.go:191] Add success.
I0320 17:58:43.409858 543705 cpu.go:282] Add success.
I0320 17:58:43.420179 543705 net.go:648] Add success.
I0320 17:58:43.423047 543705 net.go:770] primary dev: ETH0
I0320 17:58:43.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:58:43.423073 543705 net.go:698] Add success.
I0320 17:58:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:58:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:58:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:58:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:58:53.409772 543705 memory.go:184] no items to output this cycle
I0320 17:58:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 17:59:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:03.409808 543705 memory.go:184] no items to output this cycle
I0320 17:59:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 17:59:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:13.409792 543705 cpu.go:282] Add success.
I0320 17:59:13.409801 543705 memory.go:191] Add success.
W0320 17:59:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 17:59:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 17:59:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 17:59:13.420046 543705 net.go:648] Add success.
I0320 17:59:13.422764 543705 net.go:770] primary dev: ETH0
I0320 17:59:13.422778 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:59:13.422790 543705 net.go:698] Add success.
I0320 17:59:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 17:59:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 17:59:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 17:59:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 17:59:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 17:59:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 17:59:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 17:59:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:59:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:59:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 17:59:16.472360 543705 disk_local_worker.go:436] Get disk info: []
E0320 17:59:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:23.409772 543705 memory.go:184] no items to output this cycle
I0320 17:59:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 17:59:25.494164 543705 disk_info.go:125] begin check local disk info of client
I0320 17:59:25.496639 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 17:59:25.496645 543705 disk_info.go:196] parse disk info done, disk is : [0xc000322000 0xc000322040]
E0320 17:59:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:33.409807 543705 memory.go:184] no items to output this cycle
I0320 17:59:33.409824 543705 cpu.go:275] no items to output this cycle
E0320 17:59:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:43.409790 543705 memory.go:191] Add success.
I0320 17:59:43.409801 543705 cpu.go:282] Add success.
I0320 17:59:43.419948 543705 net.go:648] Add success.
I0320 17:59:43.422659 543705 net.go:770] primary dev: ETH0
I0320 17:59:43.422671 543705 net.go:802] Send network stats successfully!,count is 6
I0320 17:59:43.422684 543705 net.go:698] Add success.
I0320 17:59:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 17:59:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 17:59:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 17:59:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 17:59:53.409777 543705 memory.go:184] no items to output this cycle
I0320 17:59:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 18:00:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:03.409778 543705 memory.go:184] no items to output this cycle
I0320 18:00:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:00:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:13.409781 543705 memory.go:191] Add success.
I0320 18:00:13.409798 543705 cpu.go:282] Add success.
W0320 18:00:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:00:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:00:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:00:13.420342 543705 net.go:648] Add success.
I0320 18:00:13.423043 543705 net.go:770] primary dev: ETH0
I0320 18:00:13.423055 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:00:13.423067 543705 net.go:698] Add success.
I0320 18:00:13.469107 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1f285814-3ac8-4e86-8228-02e751ee204a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:00:13.469146 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:00:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:00:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:00:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 18:00:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:00:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 18:00:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:00:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:00:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:00:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:00:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:00:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:00:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:23.409883 543705 memory.go:184] no items to output this cycle
I0320 18:00:23.409860 543705 cpu.go:275] no items to output this cycle
I0320 18:00:25.497175 543705 disk_info.go:125] begin check local disk info of client
I0320 18:00:25.499660 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:00:25.499666 543705 disk_info.go:196] parse disk info done, disk is : [0xc000279b00 0xc000279b40]
E0320 18:00:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:33.409799 543705 memory.go:184] no items to output this cycle
I0320 18:00:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 18:00:38.578650 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:00:38.578657 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:00:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:43.410777 543705 memory.go:191] Add success.
I0320 18:00:43.409793 543705 cpu.go:282] Add success.
I0320 18:00:43.420482 543705 net.go:648] Add success.
I0320 18:00:43.423050 543705 net.go:770] primary dev: ETH0
I0320 18:00:43.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:00:43.423077 543705 net.go:698] Add success.
I0320 18:00:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:00:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:00:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:00:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:00:53.409786 543705 memory.go:184] no items to output this cycle
I0320 18:00:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 18:01:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:03.409802 543705 memory.go:184] no items to output this cycle
I0320 18:01:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 18:01:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:13.409790 543705 memory.go:191] Add success.
I0320 18:01:13.409811 543705 cpu.go:282] Add success.
W0320 18:01:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:01:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:01:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:01:13.420192 543705 net.go:648] Add success.
I0320 18:01:13.422704 543705 net.go:770] primary dev: ETH0
I0320 18:01:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:01:13.422735 543705 net.go:698] Add success.
I0320 18:01:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:01:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:01:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 18:01:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:01:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 18:01:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:01:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:01:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:01:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:01:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:01:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:01:23.409938 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:23.409957 543705 memory.go:184] no items to output this cycle
I0320 18:01:23.409938 543705 cpu.go:275] no items to output this cycle
I0320 18:01:25.499748 543705 disk_info.go:125] begin check local disk info of client
I0320 18:01:25.502211 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:01:25.502217 543705 disk_info.go:196] parse disk info done, disk is : [0xc000545e00 0xc000545e40]
E0320 18:01:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:33.409784 543705 memory.go:184] no items to output this cycle
I0320 18:01:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 18:01:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:43.409812 543705 memory.go:191] Add success.
I0320 18:01:43.409825 543705 cpu.go:282] Add success.
I0320 18:01:43.420051 543705 net.go:648] Add success.
I0320 18:01:43.422897 543705 net.go:770] primary dev: ETH0
I0320 18:01:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:01:43.422925 543705 net.go:698] Add success.
I0320 18:01:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:01:46.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:01:46.458103 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:01:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:01:53.409778 543705 memory.go:184] no items to output this cycle
I0320 18:01:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:02:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:03.409811 543705 memory.go:184] no items to output this cycle
I0320 18:02:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 18:02:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:13.409809 543705 memory.go:191] Add success.
I0320 18:02:13.409814 543705 cpu.go:282] Add success.
W0320 18:02:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:02:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:02:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:02:13.420029 543705 net.go:648] Add success.
I0320 18:02:13.422504 543705 net.go:770] primary dev: ETH0
I0320 18:02:13.422516 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:02:13.422529 543705 net.go:698] Add success.
W0320 18:02:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:02:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 18:02:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:02:14.456916 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:02:14.456926 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:02:14.456932 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:02:14.456984 543705 disk_worker.go:494] system disk:vda1
I0320 18:02:14.457014 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:02:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:02:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:02:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:02:16.457972 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:02:16.458024 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:02:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:02:16.472372 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:02:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:23.409791 543705 memory.go:184] no items to output this cycle
I0320 18:02:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 18:02:25.503203 543705 disk_info.go:125] begin check local disk info of client
I0320 18:02:25.505704 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:02:25.505710 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa200 0xc0001fa240]
E0320 18:02:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:33.409782 543705 memory.go:184] no items to output this cycle
I0320 18:02:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 18:02:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:43.409791 543705 memory.go:191] Add success.
I0320 18:02:43.409797 543705 cpu.go:282] Add success.
I0320 18:02:43.419975 543705 net.go:648] Add success.
I0320 18:02:43.422937 543705 net.go:770] primary dev: ETH0
I0320 18:02:43.422950 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:02:43.422963 543705 net.go:698] Add success.
I0320 18:02:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:02:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:02:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:02:53.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:02:53.409818 543705 memory.go:184] no items to output this cycle
I0320 18:02:53.409828 543705 cpu.go:275] no items to output this cycle
E0320 18:03:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:03.409805 543705 memory.go:184] no items to output this cycle
I0320 18:03:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 18:03:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:13.409787 543705 memory.go:191] Add success.
I0320 18:03:13.409809 543705 cpu.go:282] Add success.
W0320 18:03:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:03:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:03:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:03:13.420080 543705 net.go:648] Add success.
I0320 18:03:13.423436 543705 net.go:770] primary dev: ETH0
I0320 18:03:13.423450 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:03:13.423464 543705 net.go:698] Add success.
I0320 18:03:13.468453 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fc8fe5dc-35fa-4c0f-ad56-88b225efd54e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:03:13.468485 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:03:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:03:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:03:14.455244 543705 disk_worker.go:708] disk space is not compliant
W0320 18:03:14.455248 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:03:14.456769 543705 disk_worker.go:494] system disk:vda1
I0320 18:03:14.456802 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:03:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:03:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:03:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:03:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:03:16.472091 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:03:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:23.409799 543705 memory.go:184] no items to output this cycle
I0320 18:03:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 18:03:25.506171 543705 disk_info.go:125] begin check local disk info of client
I0320 18:03:25.508644 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:03:25.508651 543705 disk_info.go:196] parse disk info done, disk is : [0xc000323c80 0xc000323cc0]
E0320 18:03:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:33.409786 543705 memory.go:184] no items to output this cycle
I0320 18:03:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 18:03:38.579705 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:03:38.579713 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:03:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:43.410743 543705 memory.go:191] Add success.
I0320 18:03:43.409801 543705 cpu.go:282] Add success.
I0320 18:03:43.420663 543705 net.go:648] Add success.
I0320 18:03:43.423321 543705 net.go:770] primary dev: ETH0
I0320 18:03:43.423334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:03:43.423346 543705 net.go:698] Add success.
I0320 18:03:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:03:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:03:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:03:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:03:53.409781 543705 memory.go:184] no items to output this cycle
I0320 18:03:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 18:04:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:03.409789 543705 memory.go:184] no items to output this cycle
I0320 18:04:03.409799 543705 cpu.go:275] no items to output this cycle
W0320 18:04:13.409704 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:04:13.409719 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:04:13.409724 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 18:04:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:13.409815 543705 memory.go:191] Add success.
I0320 18:04:13.409824 543705 cpu.go:282] Add success.
I0320 18:04:13.419970 543705 net.go:648] Add success.
I0320 18:04:13.422481 543705 net.go:770] primary dev: ETH0
I0320 18:04:13.422494 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:04:13.422506 543705 net.go:698] Add success.
I0320 18:04:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:04:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:04:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 18:04:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:04:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 18:04:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:04:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:04:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:04:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:04:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:04:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:04:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:23.409796 543705 memory.go:184] no items to output this cycle
I0320 18:04:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 18:04:25.509239 543705 disk_info.go:125] begin check local disk info of client
I0320 18:04:25.511813 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:04:25.511820 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d4000 0xc0003d4040]
E0320 18:04:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:33.409769 543705 memory.go:184] no items to output this cycle
I0320 18:04:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 18:04:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:43.409789 543705 memory.go:191] Add success.
I0320 18:04:43.409825 543705 cpu.go:282] Add success.
I0320 18:04:43.419994 543705 net.go:648] Add success.
I0320 18:04:43.423036 543705 net.go:770] primary dev: ETH0
I0320 18:04:43.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:04:43.423064 543705 net.go:698] Add success.
I0320 18:04:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:04:46.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:04:46.458100 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:04:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:04:53.409777 543705 cpu.go:275] no items to output this cycle
I0320 18:04:53.409787 543705 memory.go:184] no items to output this cycle
E0320 18:05:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:03.409790 543705 memory.go:184] no items to output this cycle
I0320 18:05:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:05:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:13.409789 543705 memory.go:191] Add success.
I0320 18:05:13.409790 543705 cpu.go:282] Add success.
W0320 18:05:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:05:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:05:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:05:13.420039 543705 net.go:648] Add success.
I0320 18:05:13.422905 543705 net.go:770] primary dev: ETH0
I0320 18:05:13.422919 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:05:13.422931 543705 net.go:698] Add success.
I0320 18:05:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:05:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:05:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 18:05:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:05:14.456560 543705 disk_worker.go:494] system disk:vda1
I0320 18:05:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:05:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:05:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:05:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:05:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:05:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:05:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:23.409778 543705 memory.go:184] no items to output this cycle
I0320 18:05:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 18:05:25.512199 543705 disk_info.go:125] begin check local disk info of client
I0320 18:05:25.514657 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:05:25.514663 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c9ac0 0xc0003c9b00]
E0320 18:05:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:33.409800 543705 memory.go:184] no items to output this cycle
I0320 18:05:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 18:05:43.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:43.409912 543705 memory.go:191] Add success.
I0320 18:05:43.409957 543705 cpu.go:282] Add success.
I0320 18:05:43.419755 543705 net.go:648] Add success.
I0320 18:05:43.422821 543705 net.go:770] primary dev: ETH0
I0320 18:05:43.422836 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:05:43.422847 543705 net.go:698] Add success.
I0320 18:05:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:05:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:05:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:05:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:05:53.409785 543705 memory.go:184] no items to output this cycle
I0320 18:05:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 18:06:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:03.409786 543705 memory.go:184] no items to output this cycle
I0320 18:06:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 18:06:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:13.409816 543705 memory.go:191] Add success.
I0320 18:06:13.409825 543705 cpu.go:282] Add success.
W0320 18:06:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:06:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:06:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:06:13.420133 543705 net.go:648] Add success.
I0320 18:06:13.423282 543705 net.go:770] primary dev: ETH0
I0320 18:06:13.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:06:13.423307 543705 net.go:698] Add success.
I0320 18:06:13.536752 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"46350abc-16b9-426a-9d46-152add540df1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:06:13.536786 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:06:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:06:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:06:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 18:06:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:06:14.456741 543705 disk_worker.go:494] system disk:vda1
I0320 18:06:14.456776 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:06:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:06:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:06:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:06:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:06:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:06:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:23.409790 543705 memory.go:184] no items to output this cycle
I0320 18:06:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 18:06:25.515265 543705 disk_info.go:125] begin check local disk info of client
I0320 18:06:25.517826 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:06:25.517834 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003dfe80 0xc0003b8000]
E0320 18:06:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:33.409800 543705 memory.go:184] no items to output this cycle
I0320 18:06:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 18:06:38.579862 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:06:38.579869 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:06:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:43.410944 543705 memory.go:191] Add success.
I0320 18:06:43.409824 543705 cpu.go:282] Add success.
I0320 18:06:43.420660 543705 net.go:648] Add success.
I0320 18:06:43.423601 543705 net.go:770] primary dev: ETH0
I0320 18:06:43.423615 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:06:43.423627 543705 net.go:698] Add success.
I0320 18:06:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:06:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:06:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:06:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:06:53.409784 543705 memory.go:184] no items to output this cycle
I0320 18:06:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 18:07:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:03.409783 543705 memory.go:184] no items to output this cycle
I0320 18:07:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 18:07:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:13.409803 543705 memory.go:191] Add success.
I0320 18:07:13.409813 543705 cpu.go:282] Add success.
W0320 18:07:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:07:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:07:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:07:13.420112 543705 net.go:648] Add success.
I0320 18:07:13.422708 543705 net.go:770] primary dev: ETH0
I0320 18:07:13.422724 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:07:13.422738 543705 net.go:698] Add success.
I0320 18:07:13.453299 543705 event_worker.go:152] Polling the log file for events...
W0320 18:07:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:07:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 18:07:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:07:14.456762 543705 disk_worker.go:494] system disk:vda1
I0320 18:07:14.456801 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:07:14.457135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:07:14.457143 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:07:14.457148 543705 custom_config.go:64] query custom config with name: gpu
E0320 18:07:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:07:15.456812 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:07:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:07:16.457952 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:07:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:07:16.458028 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:07:16.472348 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:07:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:23.409794 543705 memory.go:184] no items to output this cycle
I0320 18:07:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 18:07:25.518229 543705 disk_info.go:125] begin check local disk info of client
I0320 18:07:25.520716 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:07:25.520723 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049cfc0 0xc00049d000]
E0320 18:07:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:33.409772 543705 memory.go:184] no items to output this cycle
I0320 18:07:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 18:07:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:43.409794 543705 memory.go:191] Add success.
I0320 18:07:43.409823 543705 cpu.go:282] Add success.
I0320 18:07:43.419723 543705 net.go:648] Add success.
I0320 18:07:43.422259 543705 net.go:770] primary dev: ETH0
I0320 18:07:43.422273 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:07:43.422283 543705 net.go:698] Add success.
I0320 18:07:46.458012 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:07:46.458088 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:07:46.458119 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:07:53.410528 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:07:53.410546 543705 memory.go:184] no items to output this cycle
I0320 18:07:53.410571 543705 cpu.go:275] no items to output this cycle
E0320 18:08:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:03.409817 543705 memory.go:184] no items to output this cycle
I0320 18:08:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 18:08:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:13.409805 543705 memory.go:191] Add success.
I0320 18:08:13.409805 543705 cpu.go:282] Add success.
W0320 18:08:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:08:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:08:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:08:13.420519 543705 net.go:648] Add success.
I0320 18:08:13.423082 543705 net.go:770] primary dev: ETH0
I0320 18:08:13.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:08:13.423110 543705 net.go:698] Add success.
I0320 18:08:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:08:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:08:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 18:08:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:08:14.456480 543705 disk_worker.go:494] system disk:vda1
I0320 18:08:14.456525 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:08:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:08:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:08:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:08:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:08:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:08:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 18:08:23.409789 543705 memory.go:184] no items to output this cycle
I0320 18:08:25.521243 543705 disk_info.go:125] begin check local disk info of client
I0320 18:08:25.523714 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:08:25.523720 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329980 0xc0003299c0]
E0320 18:08:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:33.409806 543705 memory.go:184] no items to output this cycle
I0320 18:08:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 18:08:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:43.409791 543705 memory.go:191] Add success.
I0320 18:08:43.409816 543705 cpu.go:282] Add success.
I0320 18:08:43.419741 543705 net.go:648] Add success.
I0320 18:08:43.422297 543705 net.go:770] primary dev: ETH0
I0320 18:08:43.422312 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:08:43.422325 543705 net.go:698] Add success.
I0320 18:08:46.458016 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:08:46.458103 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:08:46.458136 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:08:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:08:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 18:08:53.409798 543705 memory.go:184] no items to output this cycle
E0320 18:09:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:03.409797 543705 memory.go:184] no items to output this cycle
I0320 18:09:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 18:09:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:13.409819 543705 memory.go:191] Add success.
I0320 18:09:13.409840 543705 cpu.go:282] Add success.
W0320 18:09:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:09:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:09:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:09:13.420108 543705 net.go:648] Add success.
I0320 18:09:13.423118 543705 net.go:770] primary dev: ETH0
I0320 18:09:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:09:13.423149 543705 net.go:698] Add success.
I0320 18:09:13.514788 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"821aa077-522f-4537-891b-befb3e6c0587","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:09:13.514823 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:09:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:09:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:09:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 18:09:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:09:14.456654 543705 disk_worker.go:494] system disk:vda1
I0320 18:09:14.456684 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:09:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:09:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:09:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:09:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:09:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:09:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:23.409777 543705 memory.go:184] no items to output this cycle
I0320 18:09:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 18:09:25.523799 543705 disk_info.go:125] begin check local disk info of client
I0320 18:09:25.526226 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:09:25.526233 543705 disk_info.go:196] parse disk info done, disk is : [0xc000396800 0xc000396840]
E0320 18:09:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:33.409808 543705 memory.go:184] no items to output this cycle
I0320 18:09:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 18:09:38.580008 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:09:38.580015 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:09:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:43.410631 543705 memory.go:191] Add success.
I0320 18:09:43.409816 543705 cpu.go:282] Add success.
I0320 18:09:43.420339 543705 net.go:648] Add success.
I0320 18:09:43.423182 543705 net.go:770] primary dev: ETH0
I0320 18:09:43.423197 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:09:43.423211 543705 net.go:698] Add success.
I0320 18:09:46.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:09:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:09:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:09:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:09:53.409779 543705 memory.go:184] no items to output this cycle
I0320 18:09:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 18:10:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:03.409812 543705 memory.go:184] no items to output this cycle
I0320 18:10:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 18:10:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:13.409781 543705 memory.go:191] Add success.
W0320 18:10:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:10:13.409809 543705 cpu.go:282] Add success.
W0320 18:10:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:10:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:10:13.420125 543705 net.go:648] Add success.
I0320 18:10:13.422793 543705 net.go:770] primary dev: ETH0
I0320 18:10:13.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:10:13.422824 543705 net.go:698] Add success.
I0320 18:10:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:10:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:10:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 18:10:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:10:14.456599 543705 disk_worker.go:494] system disk:vda1
I0320 18:10:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:10:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:10:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:10:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:10:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:10:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:10:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:23.409772 543705 memory.go:184] no items to output this cycle
I0320 18:10:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 18:10:25.526270 543705 disk_info.go:125] begin check local disk info of client
I0320 18:10:25.528683 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:10:25.528688 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003db640 0xc0003db680]
E0320 18:10:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:33.409803 543705 memory.go:184] no items to output this cycle
I0320 18:10:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:10:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:43.409774 543705 memory.go:191] Add success.
I0320 18:10:43.409806 543705 cpu.go:282] Add success.
I0320 18:10:43.419842 543705 net.go:648] Add success.
I0320 18:10:43.422710 543705 net.go:770] primary dev: ETH0
I0320 18:10:43.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:10:43.422741 543705 net.go:698] Add success.
I0320 18:10:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:10:46.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:10:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:10:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:10:53.409815 543705 memory.go:184] no items to output this cycle
I0320 18:10:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 18:11:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:03.409809 543705 memory.go:184] no items to output this cycle
I0320 18:11:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 18:11:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:13.409780 543705 memory.go:191] Add success.
I0320 18:11:13.409796 543705 cpu.go:282] Add success.
W0320 18:11:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:11:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:11:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:11:13.420084 543705 net.go:648] Add success.
I0320 18:11:13.423153 543705 net.go:770] primary dev: ETH0
I0320 18:11:13.423167 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:11:13.423179 543705 net.go:698] Add success.
I0320 18:11:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:11:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:11:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 18:11:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:11:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 18:11:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:11:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:11:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:11:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:11:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:11:16.472430 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:11:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:23.409773 543705 memory.go:184] no items to output this cycle
I0320 18:11:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 18:11:25.529287 543705 disk_info.go:125] begin check local disk info of client
I0320 18:11:25.531909 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:11:25.531916 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a5980 0xc0004a59c0]
E0320 18:11:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:33.409778 543705 memory.go:184] no items to output this cycle
I0320 18:11:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 18:11:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:43.409816 543705 memory.go:191] Add success.
I0320 18:11:43.409825 543705 cpu.go:282] Add success.
I0320 18:11:43.420159 543705 net.go:648] Add success.
I0320 18:11:43.422974 543705 net.go:770] primary dev: ETH0
I0320 18:11:43.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:11:43.423010 543705 net.go:698] Add success.
I0320 18:11:46.458035 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:11:46.458123 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:11:46.458162 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:11:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:11:53.409789 543705 memory.go:184] no items to output this cycle
I0320 18:11:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 18:12:03.409898 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:03.409918 543705 memory.go:184] no items to output this cycle
I0320 18:12:03.409989 543705 cpu.go:275] no items to output this cycle
E0320 18:12:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:13.409809 543705 memory.go:191] Add success.
I0320 18:12:13.409813 543705 cpu.go:282] Add success.
W0320 18:12:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:12:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:12:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:12:13.420068 543705 net.go:648] Add success.
I0320 18:12:13.423163 543705 net.go:770] primary dev: ETH0
I0320 18:12:13.423176 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:12:13.423190 543705 net.go:698] Add success.
I0320 18:12:13.463982 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0f866a0f-eaf0-43bb-909f-24ab4283998f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:12:13.464013 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 18:12:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:12:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 18:12:14.455204 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:12:14.456810 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:12:14.456819 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:12:14.456825 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:12:14.456834 543705 disk_worker.go:494] system disk:vda1
I0320 18:12:14.456863 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:12:15.456822 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:12:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:12:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:12:16.457963 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:12:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:12:16.458026 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:12:16.472343 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:12:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:23.409795 543705 memory.go:184] no items to output this cycle
I0320 18:12:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 18:12:25.532299 543705 disk_info.go:125] begin check local disk info of client
I0320 18:12:25.534720 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:12:25.534728 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048aac0 0xc00048ab00]
E0320 18:12:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:33.409811 543705 memory.go:184] no items to output this cycle
I0320 18:12:33.409824 543705 cpu.go:275] no items to output this cycle
I0320 18:12:38.580606 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:12:38.580613 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:12:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:43.410563 543705 memory.go:191] Add success.
I0320 18:12:43.409791 543705 cpu.go:282] Add success.
I0320 18:12:43.420330 543705 net.go:648] Add success.
I0320 18:12:43.422949 543705 net.go:770] primary dev: ETH0
I0320 18:12:43.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:12:43.422974 543705 net.go:698] Add success.
I0320 18:12:46.458022 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:12:46.458103 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:12:46.458138 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:12:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:12:53.409779 543705 memory.go:184] no items to output this cycle
I0320 18:12:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:13:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:03.409818 543705 memory.go:184] no items to output this cycle
I0320 18:13:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 18:13:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:13.409792 543705 memory.go:191] Add success.
I0320 18:13:13.409804 543705 cpu.go:282] Add success.
W0320 18:13:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:13:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:13:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:13:13.420119 543705 net.go:648] Add success.
I0320 18:13:13.422708 543705 net.go:770] primary dev: ETH0
I0320 18:13:13.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:13:13.422733 543705 net.go:698] Add success.
I0320 18:13:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:13:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:13:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 18:13:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:13:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 18:13:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:13:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:13:16.458024 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:13:16.458088 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:13:16.458113 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:13:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:13:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:23.409761 543705 memory.go:184] no items to output this cycle
I0320 18:13:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 18:13:25.534807 543705 disk_info.go:125] begin check local disk info of client
I0320 18:13:25.537270 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:13:25.537287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa900 0xc0001aa940]
E0320 18:13:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:33.409781 543705 memory.go:184] no items to output this cycle
I0320 18:13:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 18:13:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:43.409790 543705 memory.go:191] Add success.
I0320 18:13:43.409791 543705 cpu.go:282] Add success.
I0320 18:13:43.419956 543705 net.go:648] Add success.
I0320 18:13:43.422465 543705 net.go:770] primary dev: ETH0
I0320 18:13:43.422479 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:13:43.422492 543705 net.go:698] Add success.
I0320 18:13:46.457608 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:13:46.457686 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:13:46.457716 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:13:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:13:53.409783 543705 memory.go:184] no items to output this cycle
I0320 18:13:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:14:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:03.409810 543705 memory.go:184] no items to output this cycle
I0320 18:14:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 18:14:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:13.409815 543705 memory.go:191] Add success.
I0320 18:14:13.409821 543705 cpu.go:282] Add success.
W0320 18:14:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:14:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:14:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:14:13.420131 543705 net.go:648] Add success.
I0320 18:14:13.423171 543705 net.go:770] primary dev: ETH0
I0320 18:14:13.423187 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:14:13.423201 543705 net.go:698] Add success.
I0320 18:14:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:14:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:14:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 18:14:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:14:14.456572 543705 disk_worker.go:494] system disk:vda1
I0320 18:14:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:14:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:14:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:14:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:14:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:14:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:14:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:23.409792 543705 memory.go:184] no items to output this cycle
I0320 18:14:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 18:14:25.537325 543705 disk_info.go:125] begin check local disk info of client
I0320 18:14:25.539893 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:14:25.539899 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f48c0 0xc0003f4900]
E0320 18:14:33.410016 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:33.410033 543705 memory.go:184] no items to output this cycle
I0320 18:14:33.410034 543705 cpu.go:275] no items to output this cycle
E0320 18:14:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:43.409794 543705 memory.go:191] Add success.
I0320 18:14:43.409794 543705 cpu.go:282] Add success.
I0320 18:14:43.419978 543705 net.go:648] Add success.
I0320 18:14:43.422807 543705 net.go:770] primary dev: ETH0
I0320 18:14:43.422820 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:14:43.422838 543705 net.go:698] Add success.
I0320 18:14:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:14:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:14:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:14:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:14:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 18:14:53.409800 543705 memory.go:184] no items to output this cycle
E0320 18:15:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:03.409788 543705 memory.go:184] no items to output this cycle
I0320 18:15:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 18:15:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:13.409817 543705 memory.go:191] Add success.
I0320 18:15:13.409825 543705 cpu.go:282] Add success.
W0320 18:15:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:15:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:15:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:15:13.420139 543705 net.go:648] Add success.
I0320 18:15:13.422924 543705 net.go:770] primary dev: ETH0
I0320 18:15:13.422939 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:15:13.422952 543705 net.go:698] Add success.
I0320 18:15:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:15:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:15:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0320 18:15:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:15:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 18:15:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:15:14.552997 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9a01fe42-f3a8-485a-97ab-019a8be009a5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:15:14.553030 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:15:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:15:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:15:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:15:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:15:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:15:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:23.409770 543705 memory.go:184] no items to output this cycle
I0320 18:15:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 18:15:25.540346 543705 disk_info.go:125] begin check local disk info of client
I0320 18:15:25.542806 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:15:25.542811 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470780 0xc0004707c0]
E0320 18:15:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:33.409776 543705 memory.go:184] no items to output this cycle
I0320 18:15:33.409794 543705 cpu.go:275] no items to output this cycle
I0320 18:15:38.581673 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:15:38.581680 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:15:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:43.410746 543705 memory.go:191] Add success.
I0320 18:15:43.409819 543705 cpu.go:282] Add success.
I0320 18:15:43.420435 543705 net.go:648] Add success.
I0320 18:15:43.423690 543705 net.go:770] primary dev: ETH0
I0320 18:15:43.423703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:15:43.423715 543705 net.go:698] Add success.
I0320 18:15:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:15:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:15:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:15:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:15:53.409788 543705 memory.go:184] no items to output this cycle
I0320 18:15:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:16:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:03.409776 543705 memory.go:184] no items to output this cycle
I0320 18:16:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 18:16:13.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:13.409922 543705 memory.go:191] Add success.
W0320 18:16:13.409953 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:16:13.409964 543705 cpu.go:282] Add success.
W0320 18:16:13.409971 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:16:13.409974 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:16:13.419748 543705 net.go:648] Add success.
I0320 18:16:13.422561 543705 net.go:770] primary dev: ETH0
I0320 18:16:13.422583 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:16:13.422596 543705 net.go:698] Add success.
I0320 18:16:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:16:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:16:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 18:16:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:16:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 18:16:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:16:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:16:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:16:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:16:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:16:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:16:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:23.409771 543705 memory.go:184] no items to output this cycle
I0320 18:16:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 18:16:25.543360 543705 disk_info.go:125] begin check local disk info of client
I0320 18:16:25.545816 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:16:25.545822 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470440 0xc000470480]
E0320 18:16:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:33.409780 543705 memory.go:184] no items to output this cycle
I0320 18:16:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 18:16:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:43.409786 543705 memory.go:191] Add success.
I0320 18:16:43.409805 543705 cpu.go:282] Add success.
I0320 18:16:43.419844 543705 net.go:648] Add success.
I0320 18:16:43.422723 543705 net.go:770] primary dev: ETH0
I0320 18:16:43.422735 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:16:43.422747 543705 net.go:698] Add success.
I0320 18:16:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:16:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:16:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:16:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:16:53.409816 543705 memory.go:184] no items to output this cycle
I0320 18:16:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 18:17:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:03.409791 543705 memory.go:184] no items to output this cycle
I0320 18:17:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 18:17:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:13.409794 543705 memory.go:191] Add success.
I0320 18:17:13.409794 543705 cpu.go:282] Add success.
W0320 18:17:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:17:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:17:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:17:13.420356 543705 net.go:648] Add success.
I0320 18:17:13.422980 543705 net.go:770] primary dev: ETH0
I0320 18:17:13.422993 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:17:13.423004 543705 net.go:698] Add success.
I0320 18:17:13.452768 543705 event_worker.go:152] Polling the log file for events...
W0320 18:17:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:17:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 18:17:14.455160 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:17:14.456923 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:17:14.456931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:17:14.456937 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:17:14.456985 543705 disk_worker.go:494] system disk:vda1
I0320 18:17:14.457025 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:17:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:17:15.456793 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:17:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:17:16.457923 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:17:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:17:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:17:16.472329 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:17:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:23.409788 543705 memory.go:184] no items to output this cycle
I0320 18:17:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 18:17:25.545904 543705 disk_info.go:125] begin check local disk info of client
I0320 18:17:25.548354 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:17:25.548361 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab780 0xc0001ab7c0]
E0320 18:17:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:33.409800 543705 memory.go:184] no items to output this cycle
I0320 18:17:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 18:17:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:43.409783 543705 memory.go:191] Add success.
I0320 18:17:43.409797 543705 cpu.go:282] Add success.
I0320 18:17:43.419899 543705 net.go:648] Add success.
I0320 18:17:43.422602 543705 net.go:770] primary dev: ETH0
I0320 18:17:43.422615 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:17:43.422627 543705 net.go:698] Add success.
I0320 18:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:17:46.458069 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:17:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:17:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:17:53.409788 543705 memory.go:184] no items to output this cycle
I0320 18:17:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:18:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:03.409781 543705 memory.go:184] no items to output this cycle
I0320 18:18:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 18:18:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:13.409805 543705 memory.go:191] Add success.
I0320 18:18:13.409810 543705 cpu.go:282] Add success.
W0320 18:18:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:18:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:18:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:18:13.419737 543705 net.go:648] Add success.
I0320 18:18:13.422999 543705 net.go:770] primary dev: ETH0
I0320 18:18:13.423012 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:18:13.423024 543705 net.go:698] Add success.
I0320 18:18:13.469518 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4a496fef-0c23-41c3-aa28-ab845115cdd3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:18:13.469549 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:18:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:18:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:18:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0320 18:18:14.455152 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:18:14.456483 543705 disk_worker.go:494] system disk:vda1
I0320 18:18:14.456525 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:18:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:18:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:18:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:18:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:18:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:18:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:23.409773 543705 cpu.go:275] no items to output this cycle
I0320 18:18:23.409774 543705 memory.go:184] no items to output this cycle
I0320 18:18:25.548389 543705 disk_info.go:125] begin check local disk info of client
I0320 18:18:25.550857 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:18:25.550863 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048ac00 0xc00048ac40]
E0320 18:18:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:33.409798 543705 memory.go:184] no items to output this cycle
I0320 18:18:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 18:18:38.582621 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:18:38.582628 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:18:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:43.410621 543705 memory.go:191] Add success.
I0320 18:18:43.409782 543705 cpu.go:282] Add success.
I0320 18:18:43.420311 543705 net.go:648] Add success.
I0320 18:18:43.422805 543705 net.go:770] primary dev: ETH0
I0320 18:18:43.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:18:43.422831 543705 net.go:698] Add success.
I0320 18:18:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:18:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:18:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:18:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:18:53.409776 543705 memory.go:184] no items to output this cycle
I0320 18:18:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 18:19:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:03.409802 543705 memory.go:184] no items to output this cycle
I0320 18:19:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 18:19:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:13.409778 543705 memory.go:191] Add success.
I0320 18:19:13.409796 543705 cpu.go:282] Add success.
W0320 18:19:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:19:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:19:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:19:13.420224 543705 net.go:648] Add success.
I0320 18:19:13.422868 543705 net.go:770] primary dev: ETH0
I0320 18:19:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:19:13.422896 543705 net.go:698] Add success.
I0320 18:19:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:19:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:19:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 18:19:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:19:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 18:19:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:19:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:19:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:19:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:19:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:19:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:19:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:23.409804 543705 memory.go:184] no items to output this cycle
I0320 18:19:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 18:19:25.551397 543705 disk_info.go:125] begin check local disk info of client
I0320 18:19:25.553864 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:19:25.553870 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a840 0xc00048a880]
E0320 18:19:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:33.409768 543705 memory.go:184] no items to output this cycle
I0320 18:19:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 18:19:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:43.409788 543705 memory.go:191] Add success.
I0320 18:19:43.409804 543705 cpu.go:282] Add success.
I0320 18:19:43.419905 543705 net.go:648] Add success.
I0320 18:19:43.423005 543705 net.go:770] primary dev: ETH0
I0320 18:19:43.423019 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:19:43.423032 543705 net.go:698] Add success.
I0320 18:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:19:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:19:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:19:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:19:53.409783 543705 memory.go:184] no items to output this cycle
I0320 18:19:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 18:20:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:03.409785 543705 memory.go:184] no items to output this cycle
I0320 18:20:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 18:20:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:13.409788 543705 cpu.go:282] Add success.
I0320 18:20:13.409794 543705 memory.go:191] Add success.
W0320 18:20:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:20:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:20:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:20:13.420203 543705 net.go:648] Add success.
I0320 18:20:13.423259 543705 net.go:770] primary dev: ETH0
I0320 18:20:13.423273 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:20:13.423284 543705 net.go:698] Add success.
I0320 18:20:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:20:14.455188 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:20:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 18:20:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:20:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 18:20:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:20:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:20:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:20:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:20:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:20:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:20:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:23.409800 543705 memory.go:184] no items to output this cycle
I0320 18:20:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 18:20:25.554414 543705 disk_info.go:125] begin check local disk info of client
I0320 18:20:25.556872 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:20:25.556878 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007abc0 0xc00007ac00]
E0320 18:20:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:33.409769 543705 memory.go:184] no items to output this cycle
I0320 18:20:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 18:20:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:43.409778 543705 memory.go:191] Add success.
I0320 18:20:43.409795 543705 cpu.go:282] Add success.
I0320 18:20:43.419885 543705 net.go:648] Add success.
I0320 18:20:43.422653 543705 net.go:770] primary dev: ETH0
I0320 18:20:43.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:20:43.422680 543705 net.go:698] Add success.
I0320 18:20:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:20:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:20:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:20:53.410344 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:20:53.410347 543705 cpu.go:275] no items to output this cycle
I0320 18:20:53.410361 543705 memory.go:184] no items to output this cycle
E0320 18:21:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:03.409785 543705 memory.go:184] no items to output this cycle
I0320 18:21:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:21:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:13.409788 543705 memory.go:191] Add success.
I0320 18:21:13.409788 543705 cpu.go:282] Add success.
W0320 18:21:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:21:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:21:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:21:13.420159 543705 net.go:648] Add success.
I0320 18:21:13.423450 543705 net.go:770] primary dev: ETH0
I0320 18:21:13.423463 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:21:13.423475 543705 net.go:698] Add success.
I0320 18:21:13.480747 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0d902f15-6cbf-4e29-8fce-26d62e9894a3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:21:13.480781 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:21:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:21:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:21:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 18:21:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:21:14.456674 543705 disk_worker.go:494] system disk:vda1
I0320 18:21:14.456706 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:21:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:21:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:21:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:21:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:21:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:21:23.410280 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:23.410300 543705 memory.go:184] no items to output this cycle
I0320 18:21:23.410301 543705 cpu.go:275] no items to output this cycle
I0320 18:21:25.557425 543705 disk_info.go:125] begin check local disk info of client
I0320 18:21:25.559947 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:21:25.559953 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaec0 0xc0001aaf00]
E0320 18:21:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:33.409781 543705 memory.go:184] no items to output this cycle
I0320 18:21:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 18:21:38.583671 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:21:38.583678 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:21:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:43.410794 543705 memory.go:191] Add success.
I0320 18:21:43.409819 543705 cpu.go:282] Add success.
I0320 18:21:43.420543 543705 net.go:648] Add success.
I0320 18:21:43.423082 543705 net.go:770] primary dev: ETH0
I0320 18:21:43.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:21:43.423112 543705 net.go:698] Add success.
I0320 18:21:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:21:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:21:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:21:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:21:53.409789 543705 memory.go:184] no items to output this cycle
I0320 18:21:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 18:22:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:03.409782 543705 memory.go:184] no items to output this cycle
I0320 18:22:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 18:22:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:13.409789 543705 memory.go:191] Add success.
I0320 18:22:13.409790 543705 cpu.go:282] Add success.
W0320 18:22:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:22:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:22:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:22:13.420183 543705 net.go:648] Add success.
I0320 18:22:13.423637 543705 net.go:770] primary dev: ETH0
I0320 18:22:13.423649 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:22:13.423660 543705 net.go:698] Add success.
W0320 18:22:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:22:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 18:22:14.455168 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:22:14.456908 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:22:14.456918 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:22:14.456923 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:22:14.456991 543705 disk_worker.go:494] system disk:vda1
I0320 18:22:14.457032 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:22:15.456810 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:22:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:22:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:22:16.457912 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:22:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:22:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:22:16.472364 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:22:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:23.409805 543705 memory.go:184] no items to output this cycle
I0320 18:22:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 18:22:25.560433 543705 disk_info.go:125] begin check local disk info of client
I0320 18:22:25.562919 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:22:25.562925 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c47c0 0xc0000c4800]
E0320 18:22:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:33.409768 543705 memory.go:184] no items to output this cycle
I0320 18:22:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 18:22:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:43.409792 543705 memory.go:191] Add success.
I0320 18:22:43.409794 543705 cpu.go:282] Add success.
I0320 18:22:43.419854 543705 net.go:648] Add success.
I0320 18:22:43.422767 543705 net.go:770] primary dev: ETH0
I0320 18:22:43.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:22:43.422792 543705 net.go:698] Add success.
I0320 18:22:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:22:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:22:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:22:53.410243 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:22:53.410259 543705 cpu.go:275] no items to output this cycle
I0320 18:22:53.410261 543705 memory.go:184] no items to output this cycle
E0320 18:23:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:03.409791 543705 cpu.go:275] no items to output this cycle
I0320 18:23:03.409794 543705 memory.go:184] no items to output this cycle
E0320 18:23:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:13.409779 543705 memory.go:191] Add success.
W0320 18:23:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:23:13.409812 543705 cpu.go:282] Add success.
W0320 18:23:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:23:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:23:13.420337 543705 net.go:648] Add success.
I0320 18:23:13.422904 543705 net.go:770] primary dev: ETH0
I0320 18:23:13.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:23:13.422928 543705 net.go:698] Add success.
I0320 18:23:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:23:14.455102 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:23:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 18:23:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:23:14.456567 543705 disk_worker.go:494] system disk:vda1
I0320 18:23:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:23:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:23:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:23:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:23:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:23:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:23:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:23.409806 543705 memory.go:184] no items to output this cycle
I0320 18:23:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 18:23:25.563456 543705 disk_info.go:125] begin check local disk info of client
I0320 18:23:25.565987 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:23:25.565993 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fe0c0 0xc0003fe100]
E0320 18:23:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:33.409771 543705 memory.go:184] no items to output this cycle
I0320 18:23:33.409779 543705 cpu.go:275] no items to output this cycle
E0320 18:23:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:43.409813 543705 memory.go:191] Add success.
I0320 18:23:43.409816 543705 cpu.go:282] Add success.
I0320 18:23:43.419925 543705 net.go:648] Add success.
I0320 18:23:43.422591 543705 net.go:770] primary dev: ETH0
I0320 18:23:43.422604 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:23:43.422617 543705 net.go:698] Add success.
I0320 18:23:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:23:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:23:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:23:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:23:53.409785 543705 memory.go:184] no items to output this cycle
I0320 18:23:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 18:24:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:03.409777 543705 memory.go:184] no items to output this cycle
I0320 18:24:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 18:24:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:13.409779 543705 memory.go:191] Add success.
W0320 18:24:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:24:13.409811 543705 cpu.go:282] Add success.
W0320 18:24:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:24:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:24:13.420396 543705 net.go:648] Add success.
I0320 18:24:13.423172 543705 net.go:770] primary dev: ETH0
I0320 18:24:13.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:24:13.423196 543705 net.go:698] Add success.
I0320 18:24:13.463231 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8d95c754-579e-4089-a4cd-7e2242bf75bb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:24:13.463262 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:24:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:24:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:24:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 18:24:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:24:14.456673 543705 disk_worker.go:494] system disk:vda1
I0320 18:24:14.456701 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:24:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:24:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:24:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:24:16.472365 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:24:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:23.409804 543705 memory.go:184] no items to output this cycle
I0320 18:24:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 18:24:25.566076 543705 disk_info.go:125] begin check local disk info of client
I0320 18:24:25.568523 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:24:25.568528 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5ac0 0xc0000c5b00]
E0320 18:24:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:33.409773 543705 memory.go:184] no items to output this cycle
I0320 18:24:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 18:24:38.583819 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:24:38.583825 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:24:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:43.410692 543705 memory.go:191] Add success.
I0320 18:24:43.409827 543705 cpu.go:282] Add success.
I0320 18:24:43.420423 543705 net.go:648] Add success.
I0320 18:24:43.423134 543705 net.go:770] primary dev: ETH0
I0320 18:24:43.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:24:43.423159 543705 net.go:698] Add success.
I0320 18:24:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:24:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:24:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:24:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:24:53.409787 543705 memory.go:184] no items to output this cycle
I0320 18:24:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 18:25:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:03.409804 543705 memory.go:184] no items to output this cycle
I0320 18:25:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 18:25:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:13.409794 543705 memory.go:191] Add success.
I0320 18:25:13.409816 543705 cpu.go:282] Add success.
W0320 18:25:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:25:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:25:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:25:13.420084 543705 net.go:648] Add success.
I0320 18:25:13.422916 543705 net.go:770] primary dev: ETH0
I0320 18:25:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:25:13.422953 543705 net.go:698] Add success.
I0320 18:25:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:25:14.455244 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:25:14.455316 543705 disk_worker.go:708] disk space is not compliant
W0320 18:25:14.455323 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:25:14.457492 543705 disk_worker.go:494] system disk:vda1
I0320 18:25:14.457534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:25:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:25:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:25:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:25:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:25:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:25:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:23.409818 543705 memory.go:184] no items to output this cycle
I0320 18:25:23.409830 543705 cpu.go:275] no items to output this cycle
I0320 18:25:25.569527 543705 disk_info.go:125] begin check local disk info of client
I0320 18:25:25.572392 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:25:25.572399 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ac00 0xc00007ac40]
E0320 18:25:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:33.409774 543705 memory.go:184] no items to output this cycle
I0320 18:25:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 18:25:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:43.409802 543705 memory.go:191] Add success.
I0320 18:25:43.409805 543705 cpu.go:282] Add success.
I0320 18:25:43.419990 543705 net.go:648] Add success.
I0320 18:25:43.422697 543705 net.go:770] primary dev: ETH0
I0320 18:25:43.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:25:43.422726 543705 net.go:698] Add success.
I0320 18:25:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:25:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:25:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:25:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:25:53.409797 543705 cpu.go:275] no items to output this cycle
I0320 18:25:53.409805 543705 memory.go:184] no items to output this cycle
E0320 18:26:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:03.409784 543705 memory.go:184] no items to output this cycle
I0320 18:26:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 18:26:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:13.409820 543705 memory.go:191] Add success.
I0320 18:26:13.409830 543705 cpu.go:282] Add success.
W0320 18:26:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:26:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:26:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:26:13.420123 543705 net.go:648] Add success.
I0320 18:26:13.422859 543705 net.go:770] primary dev: ETH0
I0320 18:26:13.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:26:13.422884 543705 net.go:698] Add success.
I0320 18:26:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:26:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:26:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 18:26:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:26:14.456828 543705 disk_worker.go:494] system disk:vda1
I0320 18:26:14.456859 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:26:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:26:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:26:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:26:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:26:16.472428 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:26:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:23.409782 543705 memory.go:184] no items to output this cycle
I0320 18:26:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 18:26:25.572485 543705 disk_info.go:125] begin check local disk info of client
I0320 18:26:25.575117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:26:25.575123 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025c080 0xc00025c0c0]
E0320 18:26:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:33.409805 543705 memory.go:184] no items to output this cycle
I0320 18:26:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 18:26:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:43.409787 543705 memory.go:191] Add success.
I0320 18:26:43.409786 543705 cpu.go:282] Add success.
I0320 18:26:43.419876 543705 net.go:648] Add success.
I0320 18:26:43.422868 543705 net.go:770] primary dev: ETH0
I0320 18:26:43.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:26:43.422893 543705 net.go:698] Add success.
I0320 18:26:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:26:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:26:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:26:53.409789 543705 memory.go:184] no items to output this cycle
I0320 18:26:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:27:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:03.409777 543705 memory.go:184] no items to output this cycle
I0320 18:27:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 18:27:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:13.409808 543705 memory.go:191] Add success.
I0320 18:27:13.409815 543705 cpu.go:282] Add success.
W0320 18:27:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:27:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:27:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:27:13.420133 543705 net.go:648] Add success.
I0320 18:27:13.422985 543705 net.go:770] primary dev: ETH0
I0320 18:27:13.422998 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:27:13.423011 543705 net.go:698] Add success.
I0320 18:27:13.429241 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 18:27:13.453414 543705 event_worker.go:152] Polling the log file for events...
I0320 18:27:13.464295 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7dfcafc1-e6fc-414c-91b5-17abccf522f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:27:13.464337 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 18:27:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:27:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 18:27:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:27:14.456117 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:27:14.456138 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:27:14.456144 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:27:14.456428 543705 disk_worker.go:494] system disk:vda1
I0320 18:27:14.456459 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:27:15.456791 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:27:15.456798 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 18:27:16.458092 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:27:16.458097 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:27:16.458154 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:27:16.458172 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:27:16.472534 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:27:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:23.409777 543705 memory.go:184] no items to output this cycle
I0320 18:27:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 18:27:25.575509 543705 disk_info.go:125] begin check local disk info of client
I0320 18:27:25.578015 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:27:25.578020 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbf00 0xc0001fbf40]
E0320 18:27:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:33.409797 543705 memory.go:184] no items to output this cycle
I0320 18:27:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 18:27:38.584616 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:27:38.584623 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:27:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:43.410689 543705 memory.go:191] Add success.
I0320 18:27:43.409784 543705 cpu.go:282] Add success.
I0320 18:27:43.420377 543705 net.go:648] Add success.
I0320 18:27:43.423345 543705 net.go:770] primary dev: ETH0
I0320 18:27:43.423359 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:27:43.423378 543705 net.go:698] Add success.
I0320 18:27:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:27:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:27:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:27:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:27:53.409776 543705 memory.go:184] no items to output this cycle
I0320 18:27:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 18:28:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:03.409788 543705 memory.go:184] no items to output this cycle
I0320 18:28:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:28:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:13.409811 543705 memory.go:191] Add success.
I0320 18:28:13.409819 543705 cpu.go:282] Add success.
W0320 18:28:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:28:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:28:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:28:13.420132 543705 net.go:648] Add success.
I0320 18:28:13.422612 543705 net.go:770] primary dev: ETH0
I0320 18:28:13.422627 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:28:13.422639 543705 net.go:698] Add success.
I0320 18:28:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:28:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:28:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 18:28:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:28:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 18:28:14.456632 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:28:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:28:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:28:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:28:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:28:16.472495 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:28:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:23.409780 543705 memory.go:184] no items to output this cycle
I0320 18:28:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 18:28:25.578524 543705 disk_info.go:125] begin check local disk info of client
I0320 18:28:25.581084 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:28:25.581091 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb380 0xc0001fb3c0]
E0320 18:28:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:33.409781 543705 memory.go:184] no items to output this cycle
I0320 18:28:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 18:28:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:43.409814 543705 memory.go:191] Add success.
I0320 18:28:43.409828 543705 cpu.go:282] Add success.
I0320 18:28:43.420072 543705 net.go:648] Add success.
I0320 18:28:43.422873 543705 net.go:770] primary dev: ETH0
I0320 18:28:43.422889 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:28:43.422902 543705 net.go:698] Add success.
I0320 18:28:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:28:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:28:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:28:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:28:53.409779 543705 memory.go:184] no items to output this cycle
I0320 18:28:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 18:29:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:03.409773 543705 memory.go:184] no items to output this cycle
I0320 18:29:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 18:29:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:13.409789 543705 memory.go:191] Add success.
I0320 18:29:13.409795 543705 cpu.go:282] Add success.
W0320 18:29:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:29:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:29:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:29:13.420068 543705 net.go:648] Add success.
I0320 18:29:13.422771 543705 net.go:770] primary dev: ETH0
I0320 18:29:13.422787 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:29:13.422799 543705 net.go:698] Add success.
I0320 18:29:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:29:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:29:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 18:29:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:29:14.456606 543705 disk_worker.go:494] system disk:vda1
I0320 18:29:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:29:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:29:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:29:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:29:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:29:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:29:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:23.409802 543705 memory.go:184] no items to output this cycle
I0320 18:29:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 18:29:25.581173 543705 disk_info.go:125] begin check local disk info of client
I0320 18:29:25.583641 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:29:25.583647 543705 disk_info.go:196] parse disk info done, disk is : [0xc000387680 0xc0003876c0]
E0320 18:29:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:33.409797 543705 memory.go:184] no items to output this cycle
I0320 18:29:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 18:29:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:43.409812 543705 memory.go:191] Add success.
I0320 18:29:43.409829 543705 cpu.go:282] Add success.
I0320 18:29:43.420046 543705 net.go:648] Add success.
I0320 18:29:43.422763 543705 net.go:770] primary dev: ETH0
I0320 18:29:43.422776 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:29:43.422788 543705 net.go:698] Add success.
I0320 18:29:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:29:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:29:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:29:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:29:53.409786 543705 memory.go:184] no items to output this cycle
I0320 18:29:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 18:30:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:03.409778 543705 memory.go:184] no items to output this cycle
I0320 18:30:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 18:30:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:13.409799 543705 memory.go:191] Add success.
I0320 18:30:13.409802 543705 cpu.go:282] Add success.
W0320 18:30:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:30:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:30:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:30:13.420061 543705 net.go:648] Add success.
I0320 18:30:13.422962 543705 net.go:770] primary dev: ETH0
I0320 18:30:13.422975 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:30:13.422988 543705 net.go:698] Add success.
I0320 18:30:13.464790 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3423e485-1b24-4839-ab67-a7e0ce8a211f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:30:13.464825 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:30:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:30:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:30:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 18:30:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:30:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 18:30:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:30:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:30:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:30:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:30:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:30:16.472473 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:30:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:23.409783 543705 memory.go:184] no items to output this cycle
I0320 18:30:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 18:30:25.584611 543705 disk_info.go:125] begin check local disk info of client
I0320 18:30:25.587230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:30:25.587237 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b000 0xc00007b040]
E0320 18:30:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:33.409764 543705 memory.go:184] no items to output this cycle
I0320 18:30:33.409803 543705 cpu.go:275] no items to output this cycle
I0320 18:30:38.585692 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:30:38.585699 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:30:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:43.410638 543705 memory.go:191] Add success.
I0320 18:30:43.409821 543705 cpu.go:282] Add success.
I0320 18:30:43.420318 543705 net.go:648] Add success.
I0320 18:30:43.423032 543705 net.go:770] primary dev: ETH0
I0320 18:30:43.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:30:43.423057 543705 net.go:698] Add success.
I0320 18:30:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:30:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:30:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:30:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:30:53.409789 543705 memory.go:184] no items to output this cycle
I0320 18:30:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:31:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:03.409785 543705 memory.go:184] no items to output this cycle
I0320 18:31:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 18:31:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:13.409823 543705 memory.go:191] Add success.
I0320 18:31:13.409823 543705 cpu.go:282] Add success.
W0320 18:31:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:31:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:31:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:31:13.420156 543705 net.go:648] Add success.
I0320 18:31:13.422732 543705 net.go:770] primary dev: ETH0
I0320 18:31:13.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:31:13.422757 543705 net.go:698] Add success.
I0320 18:31:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:31:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:31:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 18:31:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:31:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 18:31:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:31:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:31:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:31:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:31:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:31:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:31:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:23.409771 543705 memory.go:184] no items to output this cycle
I0320 18:31:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 18:31:25.587598 543705 disk_info.go:125] begin check local disk info of client
I0320 18:31:25.590166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:31:25.590173 543705 disk_info.go:196] parse disk info done, disk is : [0xc000328000 0xc000328040]
E0320 18:31:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:33.409780 543705 memory.go:184] no items to output this cycle
I0320 18:31:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:31:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:43.409780 543705 memory.go:191] Add success.
I0320 18:31:43.409819 543705 cpu.go:282] Add success.
I0320 18:31:43.419857 543705 net.go:648] Add success.
I0320 18:31:43.422999 543705 net.go:770] primary dev: ETH0
I0320 18:31:43.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:31:43.423024 543705 net.go:698] Add success.
I0320 18:31:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:31:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:31:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:31:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:31:53.409791 543705 memory.go:184] no items to output this cycle
I0320 18:31:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 18:32:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:03.409805 543705 memory.go:184] no items to output this cycle
I0320 18:32:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 18:32:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:13.409779 543705 memory.go:191] Add success.
W0320 18:32:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:32:13.409813 543705 cpu.go:282] Add success.
W0320 18:32:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:32:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:32:13.420093 543705 net.go:648] Add success.
I0320 18:32:13.422984 543705 net.go:770] primary dev: ETH0
I0320 18:32:13.422998 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:32:13.423009 543705 net.go:698] Add success.
W0320 18:32:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:32:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 18:32:14.455175 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:32:14.456937 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:32:14.456946 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:32:14.456953 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:32:14.457001 543705 disk_worker.go:494] system disk:vda1
I0320 18:32:14.457043 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:32:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:32:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:32:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:32:16.457977 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:32:16.458020 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:32:16.458037 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:32:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:32:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:23.409781 543705 memory.go:184] no items to output this cycle
I0320 18:32:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 18:32:25.590245 543705 disk_info.go:125] begin check local disk info of client
I0320 18:32:25.592754 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:32:25.592762 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a90c0 0xc0004a9100]
E0320 18:32:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:33.409835 543705 memory.go:184] no items to output this cycle
I0320 18:32:33.409908 543705 cpu.go:275] no items to output this cycle
E0320 18:32:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:43.409817 543705 memory.go:191] Add success.
I0320 18:32:43.409826 543705 cpu.go:282] Add success.
I0320 18:32:43.419962 543705 net.go:648] Add success.
I0320 18:32:43.422602 543705 net.go:770] primary dev: ETH0
I0320 18:32:43.422614 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:32:43.422626 543705 net.go:698] Add success.
I0320 18:32:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:32:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:32:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:32:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:32:53.409809 543705 memory.go:184] no items to output this cycle
I0320 18:32:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 18:33:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:03.409806 543705 memory.go:184] no items to output this cycle
I0320 18:33:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 18:33:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:13.409819 543705 memory.go:191] Add success.
I0320 18:33:13.409831 543705 cpu.go:282] Add success.
W0320 18:33:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:33:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:33:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:33:13.420205 543705 net.go:648] Add success.
I0320 18:33:13.422795 543705 net.go:770] primary dev: ETH0
I0320 18:33:13.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:33:13.422824 543705 net.go:698] Add success.
I0320 18:33:13.839389 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"88965d1d-e74f-4740-bfd8-42b2d0982577","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:33:13.839422 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:33:14.454684 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:33:14.454895 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:33:14.454906 543705 disk_worker.go:708] disk space is not compliant
W0320 18:33:14.454909 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:33:14.456240 543705 disk_worker.go:494] system disk:vda1
I0320 18:33:14.456286 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:33:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:33:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:33:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:33:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:33:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:33:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:23.409776 543705 memory.go:184] no items to output this cycle
I0320 18:33:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 18:33:25.593596 543705 disk_info.go:125] begin check local disk info of client
I0320 18:33:25.596015 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:33:25.596021 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486580 0xc0004865c0]
E0320 18:33:33.409901 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:33.409922 543705 memory.go:184] no items to output this cycle
I0320 18:33:33.410013 543705 cpu.go:275] no items to output this cycle
I0320 18:33:38.586634 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:33:38.586641 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:33:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:43.410683 543705 memory.go:191] Add success.
I0320 18:33:43.409812 543705 cpu.go:282] Add success.
I0320 18:33:43.420370 543705 net.go:648] Add success.
I0320 18:33:43.423119 543705 net.go:770] primary dev: ETH0
I0320 18:33:43.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:33:43.423148 543705 net.go:698] Add success.
I0320 18:33:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:33:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:33:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:33:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:33:53.409807 543705 memory.go:184] no items to output this cycle
I0320 18:33:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 18:34:03.409989 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:03.410007 543705 memory.go:184] no items to output this cycle
I0320 18:34:03.410018 543705 cpu.go:275] no items to output this cycle
E0320 18:34:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:13.409782 543705 memory.go:191] Add success.
W0320 18:34:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:34:13.409809 543705 cpu.go:282] Add success.
W0320 18:34:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:34:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:34:13.420104 543705 net.go:648] Add success.
I0320 18:34:13.422769 543705 net.go:770] primary dev: ETH0
I0320 18:34:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:34:13.422795 543705 net.go:698] Add success.
I0320 18:34:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:34:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:34:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 18:34:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:34:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 18:34:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:34:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:34:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:34:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:34:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:34:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:34:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:23.409805 543705 memory.go:184] no items to output this cycle
I0320 18:34:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 18:34:25.596117 543705 disk_info.go:125] begin check local disk info of client
I0320 18:34:25.598582 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:34:25.598589 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cc2c0 0xc0004cc300]
E0320 18:34:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:33.409795 543705 memory.go:184] no items to output this cycle
I0320 18:34:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 18:34:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:43.409785 543705 memory.go:191] Add success.
I0320 18:34:43.409804 543705 cpu.go:282] Add success.
I0320 18:34:43.419885 543705 net.go:648] Add success.
I0320 18:34:43.422406 543705 net.go:770] primary dev: ETH0
I0320 18:34:43.422420 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:34:43.422432 543705 net.go:698] Add success.
I0320 18:34:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:34:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:34:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:34:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:34:53.409777 543705 memory.go:184] no items to output this cycle
I0320 18:34:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 18:35:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:03.409779 543705 memory.go:184] no items to output this cycle
I0320 18:35:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 18:35:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:13.409786 543705 memory.go:191] Add success.
I0320 18:35:13.409803 543705 cpu.go:282] Add success.
W0320 18:35:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:35:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:35:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:35:13.420064 543705 net.go:648] Add success.
I0320 18:35:13.422999 543705 net.go:770] primary dev: ETH0
I0320 18:35:13.423013 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:35:13.423027 543705 net.go:698] Add success.
I0320 18:35:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:35:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:35:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 18:35:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:35:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 18:35:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:35:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:35:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:35:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:35:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:35:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:35:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:23.409793 543705 memory.go:184] no items to output this cycle
I0320 18:35:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 18:35:25.598629 543705 disk_info.go:125] begin check local disk info of client
I0320 18:35:25.601071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:35:25.601078 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b5740 0xc0004b5780]
E0320 18:35:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:33.409809 543705 memory.go:184] no items to output this cycle
I0320 18:35:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:35:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:43.409807 543705 memory.go:191] Add success.
I0320 18:35:43.409817 543705 cpu.go:282] Add success.
I0320 18:35:43.419865 543705 net.go:648] Add success.
I0320 18:35:43.422502 543705 net.go:770] primary dev: ETH0
I0320 18:35:43.422515 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:35:43.422527 543705 net.go:698] Add success.
I0320 18:35:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:35:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:35:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:35:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:35:53.409785 543705 memory.go:184] no items to output this cycle
I0320 18:35:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 18:36:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:03.409788 543705 memory.go:184] no items to output this cycle
I0320 18:36:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 18:36:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:13.409819 543705 memory.go:191] Add success.
I0320 18:36:13.409827 543705 cpu.go:282] Add success.
W0320 18:36:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:36:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:36:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:36:13.420049 543705 net.go:648] Add success.
I0320 18:36:13.422725 543705 net.go:770] primary dev: ETH0
I0320 18:36:13.422739 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:36:13.422752 543705 net.go:698] Add success.
I0320 18:36:13.463305 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31a5b4b6-7243-4d49-adca-b732358e60d8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:36:13.463340 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:36:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:36:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0320 18:36:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:36:14.456699 543705 disk_worker.go:494] system disk:vda1
I0320 18:36:14.456736 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:36:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:36:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:36:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:36:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:36:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:36:23.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:23.409918 543705 memory.go:184] no items to output this cycle
I0320 18:36:23.410075 543705 cpu.go:275] no items to output this cycle
I0320 18:36:25.601651 543705 disk_info.go:125] begin check local disk info of client
I0320 18:36:25.604059 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:36:25.604065 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034c140 0xc00034c180]
E0320 18:36:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:33.409795 543705 memory.go:184] no items to output this cycle
I0320 18:36:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 18:36:38.587684 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:36:38.587690 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:36:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:43.410679 543705 memory.go:191] Add success.
I0320 18:36:43.409824 543705 cpu.go:282] Add success.
I0320 18:36:43.420439 543705 net.go:648] Add success.
I0320 18:36:43.423022 543705 net.go:770] primary dev: ETH0
I0320 18:36:43.423035 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:36:43.423047 543705 net.go:698] Add success.
I0320 18:36:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:36:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:36:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:36:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:36:53.409782 543705 memory.go:184] no items to output this cycle
I0320 18:36:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:37:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 18:37:03.409792 543705 memory.go:184] no items to output this cycle
E0320 18:37:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:13.409793 543705 memory.go:191] Add success.
I0320 18:37:13.409796 543705 cpu.go:282] Add success.
W0320 18:37:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:37:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:37:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:37:13.420059 543705 net.go:648] Add success.
I0320 18:37:13.422899 543705 net.go:770] primary dev: ETH0
I0320 18:37:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:37:13.422926 543705 net.go:698] Add success.
I0320 18:37:13.453489 543705 event_worker.go:152] Polling the log file for events...
W0320 18:37:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:37:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 18:37:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:37:14.455880 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:37:14.455888 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:37:14.455895 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:37:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 18:37:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:37:15.456808 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:37:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:37:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:37:16.457912 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:37:16.457967 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:37:16.457987 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:37:16.472349 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:37:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:23.409872 543705 cpu.go:275] no items to output this cycle
I0320 18:37:23.409890 543705 memory.go:184] no items to output this cycle
I0320 18:37:25.604661 543705 disk_info.go:125] begin check local disk info of client
I0320 18:37:25.607118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:37:25.607124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a9240 0xc0004a9280]
E0320 18:37:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:33.409774 543705 memory.go:184] no items to output this cycle
I0320 18:37:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 18:37:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:43.409790 543705 memory.go:191] Add success.
I0320 18:37:43.409791 543705 cpu.go:282] Add success.
I0320 18:37:43.420056 543705 net.go:648] Add success.
I0320 18:37:43.422748 543705 net.go:770] primary dev: ETH0
I0320 18:37:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:37:43.422774 543705 net.go:698] Add success.
I0320 18:37:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:37:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:37:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:37:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:37:53.409810 543705 memory.go:184] no items to output this cycle
I0320 18:37:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:38:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:03.409789 543705 cpu.go:275] no items to output this cycle
I0320 18:38:03.409801 543705 memory.go:184] no items to output this cycle
E0320 18:38:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:13.409792 543705 memory.go:191] Add success.
I0320 18:38:13.409792 543705 cpu.go:282] Add success.
W0320 18:38:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:38:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:38:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:38:13.420056 543705 net.go:648] Add success.
I0320 18:38:13.422825 543705 net.go:770] primary dev: ETH0
I0320 18:38:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:38:13.422850 543705 net.go:698] Add success.
I0320 18:38:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:38:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:38:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 18:38:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:38:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 18:38:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:38:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:38:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:38:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:38:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:38:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:38:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:23.409772 543705 memory.go:184] no items to output this cycle
I0320 18:38:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 18:38:25.607205 543705 disk_info.go:125] begin check local disk info of client
I0320 18:38:25.609668 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:38:25.609674 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a4000 0xc0002a4040]
E0320 18:38:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:33.409804 543705 memory.go:184] no items to output this cycle
I0320 18:38:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 18:38:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:43.409779 543705 memory.go:191] Add success.
I0320 18:38:43.409807 543705 cpu.go:282] Add success.
I0320 18:38:43.419881 543705 net.go:648] Add success.
I0320 18:38:43.422398 543705 net.go:770] primary dev: ETH0
I0320 18:38:43.422410 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:38:43.422422 543705 net.go:698] Add success.
I0320 18:38:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:38:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:38:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:38:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:38:53.409808 543705 memory.go:184] no items to output this cycle
I0320 18:38:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 18:39:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:03.409809 543705 memory.go:184] no items to output this cycle
I0320 18:39:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 18:39:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:13.409793 543705 memory.go:191] Add success.
I0320 18:39:13.409807 543705 cpu.go:282] Add success.
W0320 18:39:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:39:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:39:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:39:13.420096 543705 net.go:648] Add success.
I0320 18:39:13.422673 543705 net.go:770] primary dev: ETH0
I0320 18:39:13.422690 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:39:13.422705 543705 net.go:698] Add success.
I0320 18:39:13.469077 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"13d45302-7618-4d71-8800-3a5e7e0c20b2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:39:13.469112 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:39:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:39:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:39:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 18:39:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:39:14.456533 543705 disk_worker.go:494] system disk:vda1
I0320 18:39:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:39:15.455624 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:39:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:39:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:39:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:39:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:39:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:23.409773 543705 memory.go:184] no items to output this cycle
I0320 18:39:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 18:39:25.610695 543705 disk_info.go:125] begin check local disk info of client
I0320 18:39:25.613130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:39:25.613136 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0320 18:39:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:33.409764 543705 memory.go:184] no items to output this cycle
I0320 18:39:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 18:39:38.588673 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:39:38.588680 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:39:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:43.410632 543705 memory.go:191] Add success.
I0320 18:39:43.409824 543705 cpu.go:282] Add success.
I0320 18:39:43.420378 543705 net.go:648] Add success.
I0320 18:39:43.423118 543705 net.go:770] primary dev: ETH0
I0320 18:39:43.423131 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:39:43.423144 543705 net.go:698] Add success.
I0320 18:39:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:39:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:39:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:39:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:39:53.409784 543705 memory.go:184] no items to output this cycle
I0320 18:39:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 18:40:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:03.409779 543705 memory.go:184] no items to output this cycle
I0320 18:40:03.409848 543705 cpu.go:275] no items to output this cycle
E0320 18:40:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:13.409795 543705 memory.go:191] Add success.
I0320 18:40:13.409815 543705 cpu.go:282] Add success.
W0320 18:40:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:40:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:40:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:40:13.420245 543705 net.go:648] Add success.
I0320 18:40:13.423398 543705 net.go:770] primary dev: ETH0
I0320 18:40:13.423411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:40:13.423422 543705 net.go:698] Add success.
I0320 18:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:40:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:40:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 18:40:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:40:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 18:40:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:40:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:40:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:40:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:40:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:40:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:40:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:23.409780 543705 memory.go:184] no items to output this cycle
I0320 18:40:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 18:40:25.613671 543705 disk_info.go:125] begin check local disk info of client
I0320 18:40:25.616136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:40:25.616143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004da080 0xc0004da0c0]
E0320 18:40:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:33.409774 543705 memory.go:184] no items to output this cycle
I0320 18:40:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 18:40:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:43.409799 543705 memory.go:191] Add success.
I0320 18:40:43.409810 543705 cpu.go:282] Add success.
I0320 18:40:43.420066 543705 net.go:648] Add success.
I0320 18:40:43.422828 543705 net.go:770] primary dev: ETH0
I0320 18:40:43.422842 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:40:43.422856 543705 net.go:698] Add success.
I0320 18:40:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:40:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:40:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:40:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:40:53.409799 543705 memory.go:184] no items to output this cycle
I0320 18:40:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 18:41:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:03.409823 543705 memory.go:184] no items to output this cycle
I0320 18:41:03.409830 543705 cpu.go:275] no items to output this cycle
E0320 18:41:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:13.409778 543705 memory.go:191] Add success.
W0320 18:41:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:41:13.409811 543705 cpu.go:282] Add success.
W0320 18:41:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:41:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:41:13.420055 543705 net.go:648] Add success.
I0320 18:41:13.422582 543705 net.go:770] primary dev: ETH0
I0320 18:41:13.422596 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:41:13.422609 543705 net.go:698] Add success.
I0320 18:41:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:41:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:41:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 18:41:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:41:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 18:41:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:41:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:41:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:41:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:41:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:41:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:41:23.410613 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:23.410629 543705 memory.go:184] no items to output this cycle
I0320 18:41:23.410646 543705 cpu.go:275] no items to output this cycle
I0320 18:41:25.616718 543705 disk_info.go:125] begin check local disk info of client
I0320 18:41:25.619185 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:41:25.619192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004863c0 0xc000486400]
E0320 18:41:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:33.409773 543705 memory.go:184] no items to output this cycle
I0320 18:41:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 18:41:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:43.409824 543705 memory.go:191] Add success.
I0320 18:41:43.409833 543705 cpu.go:282] Add success.
I0320 18:41:43.420056 543705 net.go:648] Add success.
I0320 18:41:43.425606 543705 net.go:770] primary dev: ETH0
I0320 18:41:43.425619 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:41:43.425630 543705 net.go:698] Add success.
I0320 18:41:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:41:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:41:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:41:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:41:53.409809 543705 memory.go:184] no items to output this cycle
I0320 18:41:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 18:42:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:03.409806 543705 memory.go:184] no items to output this cycle
I0320 18:42:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 18:42:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:13.409781 543705 memory.go:191] Add success.
I0320 18:42:13.409805 543705 cpu.go:282] Add success.
W0320 18:42:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:42:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:42:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:42:13.420112 543705 net.go:648] Add success.
I0320 18:42:13.423314 543705 net.go:770] primary dev: ETH0
I0320 18:42:13.423329 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:42:13.423340 543705 net.go:698] Add success.
I0320 18:42:13.463795 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4ec839d9-c2f0-431a-80b6-c302562f8f67","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:42:13.463829 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 18:42:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:42:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 18:42:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:42:14.455944 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:42:14.455953 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:42:14.455958 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:42:14.456444 543705 disk_worker.go:494] system disk:vda1
I0320 18:42:14.456471 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:42:15.456795 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:42:15.456803 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 18:42:16.457908 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:42:16.457912 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:42:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:42:16.458004 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:42:16.472339 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:42:23.410358 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:23.410454 543705 cpu.go:275] no items to output this cycle
I0320 18:42:23.410508 543705 memory.go:184] no items to output this cycle
I0320 18:42:25.619734 543705 disk_info.go:125] begin check local disk info of client
I0320 18:42:25.622248 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:42:25.622254 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048b1c0 0xc00048b200]
E0320 18:42:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:33.409774 543705 memory.go:184] no items to output this cycle
I0320 18:42:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 18:42:38.589673 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:42:38.589679 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:42:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:43.410654 543705 memory.go:191] Add success.
I0320 18:42:43.409823 543705 cpu.go:282] Add success.
I0320 18:42:43.420338 543705 net.go:648] Add success.
I0320 18:42:43.423197 543705 net.go:770] primary dev: ETH0
I0320 18:42:43.423210 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:42:43.423221 543705 net.go:698] Add success.
I0320 18:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:42:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:42:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:42:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:42:53.409773 543705 memory.go:184] no items to output this cycle
I0320 18:42:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 18:43:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:03.409811 543705 memory.go:184] no items to output this cycle
I0320 18:43:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 18:43:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:13.409790 543705 memory.go:191] Add success.
I0320 18:43:13.409790 543705 cpu.go:282] Add success.
W0320 18:43:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:43:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:43:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:43:13.420104 543705 net.go:648] Add success.
I0320 18:43:13.423314 543705 net.go:770] primary dev: ETH0
I0320 18:43:13.423328 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:43:13.423340 543705 net.go:698] Add success.
I0320 18:43:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:43:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:43:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 18:43:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:43:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 18:43:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:43:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:43:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:43:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:43:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:43:16.472446 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:43:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:23.409886 543705 cpu.go:275] no items to output this cycle
I0320 18:43:23.409889 543705 memory.go:184] no items to output this cycle
I0320 18:43:25.622753 543705 disk_info.go:125] begin check local disk info of client
I0320 18:43:25.625187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:43:25.625193 543705 disk_info.go:196] parse disk info done, disk is : [0xc000329440 0xc000329480]
E0320 18:43:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:33.409795 543705 memory.go:184] no items to output this cycle
I0320 18:43:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 18:43:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:43.409794 543705 memory.go:191] Add success.
I0320 18:43:43.409800 543705 cpu.go:282] Add success.
I0320 18:43:43.419962 543705 net.go:648] Add success.
I0320 18:43:43.422762 543705 net.go:770] primary dev: ETH0
I0320 18:43:43.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:43:43.422792 543705 net.go:698] Add success.
I0320 18:43:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:43:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:43:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:43:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:43:53.409769 543705 memory.go:184] no items to output this cycle
I0320 18:43:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 18:44:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:03.409787 543705 memory.go:184] no items to output this cycle
I0320 18:44:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 18:44:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:13.409806 543705 memory.go:191] Add success.
I0320 18:44:13.409812 543705 cpu.go:282] Add success.
W0320 18:44:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:44:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:44:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:44:13.420070 543705 net.go:648] Add success.
I0320 18:44:13.422751 543705 net.go:770] primary dev: ETH0
I0320 18:44:13.422764 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:44:13.422777 543705 net.go:698] Add success.
I0320 18:44:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:44:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:44:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 18:44:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:44:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 18:44:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:44:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:44:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:44:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:44:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:44:16.472371 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:44:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:23.409763 543705 memory.go:184] no items to output this cycle
I0320 18:44:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 18:44:25.625673 543705 disk_info.go:125] begin check local disk info of client
I0320 18:44:25.628132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:44:25.628138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba000 0xc0003ba040]
E0320 18:44:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:33.409762 543705 memory.go:184] no items to output this cycle
I0320 18:44:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 18:44:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:43.409811 543705 memory.go:191] Add success.
I0320 18:44:43.409816 543705 cpu.go:282] Add success.
I0320 18:44:43.419970 543705 net.go:648] Add success.
I0320 18:44:43.422783 543705 net.go:770] primary dev: ETH0
I0320 18:44:43.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:44:43.422807 543705 net.go:698] Add success.
I0320 18:44:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:44:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:44:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:44:53.409808 543705 memory.go:184] no items to output this cycle
I0320 18:44:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 18:45:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:03.409782 543705 memory.go:184] no items to output this cycle
I0320 18:45:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 18:45:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:13.409808 543705 memory.go:191] Add success.
I0320 18:45:13.409817 543705 cpu.go:282] Add success.
W0320 18:45:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:45:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:45:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:45:13.420048 543705 net.go:648] Add success.
I0320 18:45:13.422870 543705 net.go:770] primary dev: ETH0
I0320 18:45:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:45:13.422895 543705 net.go:698] Add success.
I0320 18:45:13.469156 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8c78c04d-9fef-4468-98bd-471a0d423560","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:45:13.469188 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:45:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:45:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:45:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 18:45:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:45:14.456599 543705 disk_worker.go:494] system disk:vda1
I0320 18:45:14.456630 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:45:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:45:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:45:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:45:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:45:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:45:23.409830 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:23.409848 543705 memory.go:184] no items to output this cycle
I0320 18:45:23.409963 543705 cpu.go:275] no items to output this cycle
I0320 18:45:25.628219 543705 disk_info.go:125] begin check local disk info of client
I0320 18:45:25.630679 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:45:25.630686 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032fcc0 0xc00032fd00]
E0320 18:45:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:33.409799 543705 memory.go:184] no items to output this cycle
I0320 18:45:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 18:45:38.589824 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:45:38.589831 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:45:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:43.410578 543705 memory.go:191] Add success.
I0320 18:45:43.409800 543705 cpu.go:282] Add success.
I0320 18:45:43.420304 543705 net.go:648] Add success.
I0320 18:45:43.422837 543705 net.go:770] primary dev: ETH0
I0320 18:45:43.422849 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:45:43.422862 543705 net.go:698] Add success.
I0320 18:45:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:45:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:45:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:45:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:45:53.409772 543705 memory.go:184] no items to output this cycle
I0320 18:45:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:46:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:03.409810 543705 memory.go:184] no items to output this cycle
I0320 18:46:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 18:46:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:13.409785 543705 memory.go:191] Add success.
I0320 18:46:13.409803 543705 cpu.go:282] Add success.
W0320 18:46:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:46:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:46:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:46:13.420137 543705 net.go:648] Add success.
I0320 18:46:13.422847 543705 net.go:770] primary dev: ETH0
I0320 18:46:13.422861 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:46:13.422874 543705 net.go:698] Add success.
I0320 18:46:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:46:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:46:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 18:46:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:46:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 18:46:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:46:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:46:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:46:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:46:23.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:23.409864 543705 cpu.go:275] no items to output this cycle
I0320 18:46:23.409872 543705 memory.go:184] no items to output this cycle
I0320 18:46:25.630768 543705 disk_info.go:125] begin check local disk info of client
I0320 18:46:25.633218 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:46:25.633224 543705 disk_info.go:196] parse disk info done, disk is : [0xc000386d00 0xc000386d40]
E0320 18:46:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:33.409792 543705 memory.go:184] no items to output this cycle
I0320 18:46:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 18:46:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:43.409793 543705 memory.go:191] Add success.
I0320 18:46:43.409798 543705 cpu.go:282] Add success.
I0320 18:46:43.419864 543705 net.go:648] Add success.
I0320 18:46:43.422574 543705 net.go:770] primary dev: ETH0
I0320 18:46:43.422586 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:46:43.422598 543705 net.go:698] Add success.
I0320 18:46:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:46:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:46:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:46:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:46:53.409779 543705 memory.go:184] no items to output this cycle
I0320 18:46:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 18:47:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:03.409786 543705 memory.go:184] no items to output this cycle
I0320 18:47:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 18:47:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:13.409790 543705 memory.go:191] Add success.
I0320 18:47:13.409791 543705 cpu.go:282] Add success.
W0320 18:47:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:47:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:47:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:47:13.420215 543705 net.go:648] Add success.
I0320 18:47:13.422933 543705 net.go:770] primary dev: ETH0
I0320 18:47:13.422947 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:47:13.422961 543705 net.go:698] Add success.
I0320 18:47:13.453509 543705 event_worker.go:152] Polling the log file for events...
W0320 18:47:14.455238 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:47:14.455255 543705 disk_worker.go:708] disk space is not compliant
W0320 18:47:14.455259 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:47:14.455915 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:47:14.455924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:47:14.455930 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:47:14.456839 543705 disk_worker.go:494] system disk:vda1
I0320 18:47:14.456870 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:47:15.456855 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:47:15.456864 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:47:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:47:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:47:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:47:16.457993 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:47:16.472350 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:47:23.409867 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:23.409899 543705 cpu.go:275] no items to output this cycle
I0320 18:47:23.409973 543705 memory.go:184] no items to output this cycle
I0320 18:47:25.633673 543705 disk_info.go:125] begin check local disk info of client
I0320 18:47:25.636091 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:47:25.636096 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331a00 0xc000331a40]
E0320 18:47:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:33.409772 543705 memory.go:184] no items to output this cycle
I0320 18:47:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 18:47:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:43.409788 543705 memory.go:191] Add success.
I0320 18:47:43.409788 543705 cpu.go:282] Add success.
I0320 18:47:43.419974 543705 net.go:648] Add success.
I0320 18:47:43.423275 543705 net.go:770] primary dev: ETH0
I0320 18:47:43.423289 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:47:43.423301 543705 net.go:698] Add success.
I0320 18:47:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:47:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:47:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:47:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:47:53.409779 543705 memory.go:184] no items to output this cycle
I0320 18:47:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:48:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:03.409805 543705 memory.go:184] no items to output this cycle
I0320 18:48:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 18:48:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:13.409800 543705 memory.go:191] Add success.
I0320 18:48:13.409800 543705 cpu.go:282] Add success.
W0320 18:48:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:48:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:48:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:48:13.420183 543705 net.go:648] Add success.
I0320 18:48:13.422818 543705 net.go:770] primary dev: ETH0
I0320 18:48:13.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:48:13.422845 543705 net.go:698] Add success.
I0320 18:48:13.470047 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a700ff0c-93b0-4806-b1e7-113c75185ffa","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:48:13.470079 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:48:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:48:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:48:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 18:48:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:48:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 18:48:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:48:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:48:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:48:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:48:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:48:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:48:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:23.409766 543705 memory.go:184] no items to output this cycle
I0320 18:48:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 18:48:25.636822 543705 disk_info.go:125] begin check local disk info of client
I0320 18:48:25.639298 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:48:25.639304 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004e3d40 0xc0004e3d80]
E0320 18:48:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:33.409777 543705 memory.go:184] no items to output this cycle
I0320 18:48:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 18:48:38.590650 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:48:38.590656 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:48:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:43.410836 543705 memory.go:191] Add success.
I0320 18:48:43.409817 543705 cpu.go:282] Add success.
I0320 18:48:43.420339 543705 net.go:770] primary dev: ETH0
I0320 18:48:43.420351 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:48:43.420364 543705 net.go:698] Add success.
I0320 18:48:43.420704 543705 net.go:648] Add success.
I0320 18:48:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:48:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:48:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:48:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:48:53.409785 543705 memory.go:184] no items to output this cycle
I0320 18:48:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 18:49:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:03.409778 543705 memory.go:184] no items to output this cycle
I0320 18:49:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:49:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:13.409806 543705 memory.go:191] Add success.
I0320 18:49:13.409810 543705 cpu.go:282] Add success.
W0320 18:49:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:49:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:49:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:49:13.419869 543705 net.go:770] primary dev: ETH0
I0320 18:49:13.419884 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:49:13.419898 543705 net.go:698] Add success.
I0320 18:49:13.420265 543705 net.go:648] Add success.
I0320 18:49:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:49:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:49:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 18:49:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:49:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 18:49:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:49:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:49:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:49:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:49:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:49:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:49:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:23.409781 543705 memory.go:184] no items to output this cycle
I0320 18:49:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 18:49:25.639389 543705 disk_info.go:125] begin check local disk info of client
I0320 18:49:25.641916 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:49:25.641923 543705 disk_info.go:196] parse disk info done, disk is : [0xc000325000 0xc000325040]
E0320 18:49:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:33.409775 543705 memory.go:184] no items to output this cycle
I0320 18:49:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 18:49:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:43.409788 543705 memory.go:191] Add success.
I0320 18:49:43.409803 543705 cpu.go:282] Add success.
I0320 18:49:43.419859 543705 net.go:648] Add success.
I0320 18:49:43.422614 543705 net.go:770] primary dev: ETH0
I0320 18:49:43.422626 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:49:43.422652 543705 net.go:698] Add success.
I0320 18:49:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:49:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:49:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:49:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:49:53.409796 543705 memory.go:184] no items to output this cycle
I0320 18:49:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 18:50:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:03.409809 543705 memory.go:184] no items to output this cycle
I0320 18:50:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 18:50:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:13.409824 543705 memory.go:191] Add success.
I0320 18:50:13.409828 543705 cpu.go:282] Add success.
W0320 18:50:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:50:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:50:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:50:13.420069 543705 net.go:648] Add success.
I0320 18:50:13.423122 543705 net.go:770] primary dev: ETH0
I0320 18:50:13.423137 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:50:13.423151 543705 net.go:698] Add success.
I0320 18:50:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:50:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:50:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 18:50:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:50:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 18:50:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:50:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:50:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:50:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:50:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:50:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:50:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:23.409780 543705 memory.go:184] no items to output this cycle
I0320 18:50:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 18:50:25.642011 543705 disk_info.go:125] begin check local disk info of client
I0320 18:50:25.644457 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:50:25.644464 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266000 0xc000266040]
E0320 18:50:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:33.409820 543705 memory.go:184] no items to output this cycle
I0320 18:50:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 18:50:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:43.409797 543705 memory.go:191] Add success.
I0320 18:50:43.409799 543705 cpu.go:282] Add success.
I0320 18:50:43.419990 543705 net.go:648] Add success.
I0320 18:50:43.422823 543705 net.go:770] primary dev: ETH0
I0320 18:50:43.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:50:43.422853 543705 net.go:698] Add success.
I0320 18:50:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:50:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:50:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:50:53.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:50:53.409820 543705 memory.go:184] no items to output this cycle
I0320 18:50:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 18:51:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:03.409789 543705 memory.go:184] no items to output this cycle
I0320 18:51:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 18:51:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:13.409794 543705 memory.go:191] Add success.
I0320 18:51:13.409805 543705 cpu.go:282] Add success.
W0320 18:51:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:51:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:51:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:51:13.420142 543705 net.go:648] Add success.
I0320 18:51:13.423173 543705 net.go:770] primary dev: ETH0
I0320 18:51:13.423186 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:51:13.423199 543705 net.go:698] Add success.
I0320 18:51:13.464652 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"026ee7fe-ec02-41c2-9c08-b2acd676caea","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:51:13.464685 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:51:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:51:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:51:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 18:51:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:51:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 18:51:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:51:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:51:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:51:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:51:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:51:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:51:23.410356 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:23.410370 543705 memory.go:184] no items to output this cycle
I0320 18:51:23.410372 543705 cpu.go:275] no items to output this cycle
I0320 18:51:25.644865 543705 disk_info.go:125] begin check local disk info of client
I0320 18:51:25.647325 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:51:25.647332 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048df00 0xc00048df40]
E0320 18:51:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:33.409774 543705 memory.go:184] no items to output this cycle
I0320 18:51:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 18:51:38.591703 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:51:38.591710 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:51:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:43.410744 543705 memory.go:191] Add success.
I0320 18:51:43.409803 543705 cpu.go:282] Add success.
I0320 18:51:43.420506 543705 net.go:648] Add success.
I0320 18:51:43.423283 543705 net.go:770] primary dev: ETH0
I0320 18:51:43.423296 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:51:43.423308 543705 net.go:698] Add success.
I0320 18:51:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:51:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:51:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:51:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:51:53.409792 543705 memory.go:184] no items to output this cycle
I0320 18:51:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 18:52:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:03.409777 543705 memory.go:184] no items to output this cycle
I0320 18:52:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 18:52:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:13.409818 543705 memory.go:191] Add success.
I0320 18:52:13.409818 543705 cpu.go:282] Add success.
W0320 18:52:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:52:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:52:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:52:13.420308 543705 net.go:648] Add success.
I0320 18:52:13.423053 543705 net.go:770] primary dev: ETH0
I0320 18:52:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:52:13.423078 543705 net.go:698] Add success.
W0320 18:52:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:52:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 18:52:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:52:14.456786 543705 disk_worker.go:494] system disk:vda1
I0320 18:52:14.456826 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:52:14.457133 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:52:14.457140 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:52:14.457145 543705 custom_config.go:64] query custom config with name: gpu
E0320 18:52:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:52:15.456836 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:52:16.457922 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:52:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:52:16.457977 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:52:16.458001 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:52:16.472347 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:52:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:23.409770 543705 memory.go:184] no items to output this cycle
I0320 18:52:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 18:52:25.647414 543705 disk_info.go:125] begin check local disk info of client
I0320 18:52:25.649896 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:52:25.649902 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb080 0xc0001fb0c0]
E0320 18:52:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:33.409773 543705 memory.go:184] no items to output this cycle
I0320 18:52:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 18:52:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:43.409790 543705 memory.go:191] Add success.
I0320 18:52:43.409794 543705 cpu.go:282] Add success.
I0320 18:52:43.419887 543705 net.go:648] Add success.
I0320 18:52:43.422634 543705 net.go:770] primary dev: ETH0
I0320 18:52:43.422647 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:52:43.422659 543705 net.go:698] Add success.
I0320 18:52:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:52:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:52:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:52:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:52:53.409786 543705 memory.go:184] no items to output this cycle
I0320 18:52:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 18:53:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:03.409805 543705 memory.go:184] no items to output this cycle
I0320 18:53:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 18:53:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:13.409783 543705 memory.go:191] Add success.
W0320 18:53:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:53:13.409807 543705 cpu.go:282] Add success.
W0320 18:53:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:53:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:53:13.420213 543705 net.go:648] Add success.
I0320 18:53:13.422954 543705 net.go:770] primary dev: ETH0
I0320 18:53:13.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:53:13.422979 543705 net.go:698] Add success.
I0320 18:53:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:53:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:53:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 18:53:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:53:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 18:53:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:53:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:53:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:53:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:53:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:53:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:23.409810 543705 memory.go:184] no items to output this cycle
I0320 18:53:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 18:53:25.649982 543705 disk_info.go:125] begin check local disk info of client
I0320 18:53:25.652424 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:53:25.652430 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b400 0xc00007b440]
E0320 18:53:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:33.409795 543705 memory.go:184] no items to output this cycle
I0320 18:53:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 18:53:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:43.409790 543705 memory.go:191] Add success.
I0320 18:53:43.409791 543705 cpu.go:282] Add success.
I0320 18:53:43.419974 543705 net.go:648] Add success.
I0320 18:53:43.422579 543705 net.go:770] primary dev: ETH0
I0320 18:53:43.422594 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:53:43.422608 543705 net.go:698] Add success.
I0320 18:53:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:53:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:53:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:53:53.410590 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:53:53.410613 543705 memory.go:184] no items to output this cycle
I0320 18:53:53.410627 543705 cpu.go:275] no items to output this cycle
E0320 18:54:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:03.409777 543705 memory.go:184] no items to output this cycle
I0320 18:54:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:54:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:13.409813 543705 memory.go:191] Add success.
I0320 18:54:13.409818 543705 cpu.go:282] Add success.
W0320 18:54:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:54:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:54:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:54:13.420241 543705 net.go:648] Add success.
I0320 18:54:13.423213 543705 net.go:770] primary dev: ETH0
I0320 18:54:13.423226 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:54:13.423238 543705 net.go:698] Add success.
I0320 18:54:13.545348 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"92a0a95a-8238-4933-9db1-8a9eaf8437ef","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:54:13.545378 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 18:54:14.453983 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:54:14.454170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:54:14.454257 543705 disk_worker.go:708] disk space is not compliant
W0320 18:54:14.454261 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:54:14.455797 543705 disk_worker.go:494] system disk:vda1
I0320 18:54:14.455828 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:54:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:54:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:54:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:54:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:54:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:54:23.409824 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:23.409846 543705 memory.go:184] no items to output this cycle
I0320 18:54:23.409910 543705 cpu.go:275] no items to output this cycle
I0320 18:54:25.652509 543705 disk_info.go:125] begin check local disk info of client
I0320 18:54:25.654978 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:54:25.654984 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002af9c0 0xc0002afa00]
E0320 18:54:33.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:33.409758 543705 memory.go:184] no items to output this cycle
I0320 18:54:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 18:54:38.592667 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:54:38.592673 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:54:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:43.410691 543705 memory.go:191] Add success.
I0320 18:54:43.409804 543705 cpu.go:282] Add success.
I0320 18:54:43.420439 543705 net.go:648] Add success.
I0320 18:54:43.423152 543705 net.go:770] primary dev: ETH0
I0320 18:54:43.423181 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:54:43.423195 543705 net.go:698] Add success.
I0320 18:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:54:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:54:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:54:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:54:53.409809 543705 memory.go:184] no items to output this cycle
I0320 18:54:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 18:55:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:03.409775 543705 memory.go:184] no items to output this cycle
I0320 18:55:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:55:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:13.409808 543705 memory.go:191] Add success.
I0320 18:55:13.409817 543705 cpu.go:282] Add success.
W0320 18:55:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:55:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:55:13.409858 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:55:13.420099 543705 net.go:648] Add success.
I0320 18:55:13.422698 543705 net.go:770] primary dev: ETH0
I0320 18:55:13.422711 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:55:13.422724 543705 net.go:698] Add success.
I0320 18:55:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:55:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:55:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 18:55:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:55:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 18:55:14.456534 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:55:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:55:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:55:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:55:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:55:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:55:23.409847 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:23.409864 543705 memory.go:184] no items to output this cycle
I0320 18:55:23.409943 543705 cpu.go:275] no items to output this cycle
I0320 18:55:25.655911 543705 disk_info.go:125] begin check local disk info of client
I0320 18:55:25.658363 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:55:25.658369 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b4240 0xc0002b4280]
E0320 18:55:33.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:33.409759 543705 memory.go:184] no items to output this cycle
I0320 18:55:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 18:55:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:43.409815 543705 memory.go:191] Add success.
I0320 18:55:43.409819 543705 cpu.go:282] Add success.
I0320 18:55:43.420011 543705 net.go:648] Add success.
I0320 18:55:43.422753 543705 net.go:770] primary dev: ETH0
I0320 18:55:43.422768 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:55:43.422782 543705 net.go:698] Add success.
I0320 18:55:46.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:55:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:55:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:55:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:55:53.409784 543705 memory.go:184] no items to output this cycle
I0320 18:55:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 18:56:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:03.409805 543705 memory.go:184] no items to output this cycle
I0320 18:56:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 18:56:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:13.409821 543705 memory.go:191] Add success.
I0320 18:56:13.409825 543705 cpu.go:282] Add success.
W0320 18:56:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:56:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:56:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:56:13.420260 543705 net.go:648] Add success.
I0320 18:56:13.422982 543705 net.go:770] primary dev: ETH0
I0320 18:56:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:56:13.423010 543705 net.go:698] Add success.
I0320 18:56:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:56:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:56:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 18:56:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:56:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 18:56:14.456549 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:56:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:56:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:56:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:56:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:56:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:56:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 18:56:23.409784 543705 memory.go:184] no items to output this cycle
I0320 18:56:25.658926 543705 disk_info.go:125] begin check local disk info of client
I0320 18:56:25.661371 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:56:25.661468 543705 disk_info.go:196] parse disk info done, disk is : [0xc00052ff40 0xc000530000]
E0320 18:56:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:33.409800 543705 memory.go:184] no items to output this cycle
I0320 18:56:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 18:56:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:43.409782 543705 memory.go:191] Add success.
I0320 18:56:43.409799 543705 cpu.go:282] Add success.
I0320 18:56:43.419972 543705 net.go:648] Add success.
I0320 18:56:43.422517 543705 net.go:770] primary dev: ETH0
I0320 18:56:43.422530 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:56:43.422543 543705 net.go:698] Add success.
I0320 18:56:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:56:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:56:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:56:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:56:53.409786 543705 memory.go:184] no items to output this cycle
I0320 18:56:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 18:57:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:03.409779 543705 memory.go:184] no items to output this cycle
I0320 18:57:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 18:57:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:13.409789 543705 memory.go:191] Add success.
I0320 18:57:13.409805 543705 cpu.go:282] Add success.
W0320 18:57:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:57:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:57:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:57:13.420210 543705 net.go:648] Add success.
I0320 18:57:13.428726 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 18:57:13.428801 543705 net.go:770] primary dev: ETH0
I0320 18:57:13.428813 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:57:13.428825 543705 net.go:698] Add success.
I0320 18:57:13.453383 543705 event_worker.go:152] Polling the log file for events...
I0320 18:57:13.463994 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"66cb085d-f745-4fb5-8067-b973c26a1ac5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 18:57:13.464026 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 18:57:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:57:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 18:57:14.455185 543705 disk_worker.go:728] disk inode is not compliant
E0320 18:57:14.455897 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 18:57:14.455906 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 18:57:14.455911 543705 custom_config.go:64] query custom config with name: gpu
I0320 18:57:14.456540 543705 disk_worker.go:494] system disk:vda1
I0320 18:57:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 18:57:15.456797 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 18:57:15.456806 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:57:16.457901 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 18:57:16.457901 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 18:57:16.457954 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:57:16.457973 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:57:16.472291 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:57:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:23.409794 543705 memory.go:184] no items to output this cycle
I0320 18:57:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 18:57:25.661672 543705 disk_info.go:125] begin check local disk info of client
I0320 18:57:25.664089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:57:25.664095 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034fec0 0xc00034ff00]
E0320 18:57:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:33.409767 543705 memory.go:184] no items to output this cycle
I0320 18:57:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 18:57:38.593683 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 18:57:38.593690 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 18:57:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:43.410901 543705 memory.go:191] Add success.
I0320 18:57:43.409804 543705 cpu.go:282] Add success.
I0320 18:57:43.420677 543705 net.go:648] Add success.
I0320 18:57:43.423573 543705 net.go:770] primary dev: ETH0
I0320 18:57:43.423587 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:57:43.423601 543705 net.go:698] Add success.
I0320 18:57:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:57:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:57:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:57:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:57:53.409782 543705 memory.go:184] no items to output this cycle
I0320 18:57:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 18:58:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:03.409809 543705 memory.go:184] no items to output this cycle
I0320 18:58:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 18:58:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:13.409784 543705 memory.go:191] Add success.
W0320 18:58:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 18:58:13.409811 543705 cpu.go:282] Add success.
W0320 18:58:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:58:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:58:13.420078 543705 net.go:648] Add success.
I0320 18:58:13.423003 543705 net.go:770] primary dev: ETH0
I0320 18:58:13.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:58:13.423029 543705 net.go:698] Add success.
I0320 18:58:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:58:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:58:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 18:58:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:58:14.456573 543705 disk_worker.go:494] system disk:vda1
I0320 18:58:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:58:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:58:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:58:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:58:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:58:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:58:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:23.409792 543705 memory.go:184] no items to output this cycle
I0320 18:58:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 18:58:25.664968 543705 disk_info.go:125] begin check local disk info of client
I0320 18:58:25.667429 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:58:25.667434 543705 disk_info.go:196] parse disk info done, disk is : [0xc000543480 0xc0005434c0]
E0320 18:58:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:33.409796 543705 memory.go:184] no items to output this cycle
I0320 18:58:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 18:58:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:43.409786 543705 memory.go:191] Add success.
I0320 18:58:43.409818 543705 cpu.go:282] Add success.
I0320 18:58:43.419848 543705 net.go:648] Add success.
I0320 18:58:43.422488 543705 net.go:770] primary dev: ETH0
I0320 18:58:43.422500 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:58:43.422513 543705 net.go:698] Add success.
I0320 18:58:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:58:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:58:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:58:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:58:53.409776 543705 memory.go:184] no items to output this cycle
I0320 18:58:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 18:59:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:03.409796 543705 cpu.go:275] no items to output this cycle
I0320 18:59:03.409805 543705 memory.go:184] no items to output this cycle
E0320 18:59:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:13.409818 543705 memory.go:191] Add success.
I0320 18:59:13.409826 543705 cpu.go:282] Add success.
W0320 18:59:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 18:59:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 18:59:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 18:59:13.420088 543705 net.go:648] Add success.
I0320 18:59:13.422907 543705 net.go:770] primary dev: ETH0
I0320 18:59:13.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:59:13.422933 543705 net.go:698] Add success.
I0320 18:59:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 18:59:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 18:59:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 18:59:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 18:59:14.456500 543705 disk_worker.go:494] system disk:vda1
I0320 18:59:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 18:59:15.455983 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 18:59:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:59:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:59:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 18:59:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 18:59:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:23.409800 543705 memory.go:184] no items to output this cycle
I0320 18:59:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 18:59:25.667969 543705 disk_info.go:125] begin check local disk info of client
I0320 18:59:25.670626 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 18:59:25.670632 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046c640 0xc00046c680]
E0320 18:59:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:33.409902 543705 memory.go:184] no items to output this cycle
I0320 18:59:33.409916 543705 cpu.go:275] no items to output this cycle
E0320 18:59:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:43.409824 543705 memory.go:191] Add success.
I0320 18:59:43.409834 543705 cpu.go:282] Add success.
I0320 18:59:43.420037 543705 net.go:648] Add success.
I0320 18:59:43.422866 543705 net.go:770] primary dev: ETH0
I0320 18:59:43.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 18:59:43.422890 543705 net.go:698] Add success.
I0320 18:59:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 18:59:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 18:59:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 18:59:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 18:59:53.409778 543705 memory.go:184] no items to output this cycle
I0320 18:59:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 19:00:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:03.409795 543705 memory.go:184] no items to output this cycle
I0320 19:00:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:00:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:13.409817 543705 memory.go:191] Add success.
I0320 19:00:13.409827 543705 cpu.go:282] Add success.
W0320 19:00:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:00:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:00:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:00:13.420101 543705 net.go:648] Add success.
I0320 19:00:13.423115 543705 net.go:770] primary dev: ETH0
I0320 19:00:13.423128 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:00:13.423141 543705 net.go:698] Add success.
I0320 19:00:13.471427 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5565a369-1e03-4d9a-921e-3b95897419c6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:00:13.471460 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:00:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:00:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:00:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 19:00:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:00:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 19:00:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:00:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:00:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:00:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:00:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:00:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:00:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:23.409774 543705 memory.go:184] no items to output this cycle
I0320 19:00:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 19:00:25.670712 543705 disk_info.go:125] begin check local disk info of client
I0320 19:00:25.673147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:00:25.673153 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ca100 0xc0004ca140]
E0320 19:00:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:33.409919 543705 memory.go:184] no items to output this cycle
I0320 19:00:33.409938 543705 cpu.go:275] no items to output this cycle
I0320 19:00:38.593839 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:00:38.593845 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:00:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:43.410670 543705 memory.go:191] Add success.
I0320 19:00:43.409814 543705 cpu.go:282] Add success.
I0320 19:00:43.420402 543705 net.go:648] Add success.
I0320 19:00:43.423168 543705 net.go:770] primary dev: ETH0
I0320 19:00:43.423185 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:00:43.423200 543705 net.go:698] Add success.
I0320 19:00:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:00:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:00:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:00:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:00:53.409781 543705 memory.go:184] no items to output this cycle
I0320 19:00:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 19:01:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:03.409787 543705 cpu.go:275] no items to output this cycle
I0320 19:01:03.409802 543705 memory.go:184] no items to output this cycle
E0320 19:01:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:13.409795 543705 memory.go:191] Add success.
I0320 19:01:13.409809 543705 cpu.go:282] Add success.
W0320 19:01:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:01:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:01:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:01:13.420080 543705 net.go:648] Add success.
I0320 19:01:13.422898 543705 net.go:770] primary dev: ETH0
I0320 19:01:13.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:01:13.422926 543705 net.go:698] Add success.
I0320 19:01:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:01:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:01:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 19:01:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:01:14.456486 543705 disk_worker.go:494] system disk:vda1
I0320 19:01:14.456528 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:01:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:01:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:01:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:01:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:01:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:01:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:01:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:01:25.673670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:01:25.676143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:01:25.676148 543705 disk_info.go:196] parse disk info done, disk is : [0xc000358700 0xc000358740]
E0320 19:01:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:33.409790 543705 memory.go:184] no items to output this cycle
I0320 19:01:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:01:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:43.409831 543705 memory.go:191] Add success.
I0320 19:01:43.409838 543705 cpu.go:282] Add success.
I0320 19:01:43.420010 543705 net.go:648] Add success.
I0320 19:01:43.422705 543705 net.go:770] primary dev: ETH0
I0320 19:01:43.422718 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:01:43.422732 543705 net.go:698] Add success.
I0320 19:01:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:01:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:01:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:01:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:01:53.409807 543705 memory.go:184] no items to output this cycle
I0320 19:01:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 19:02:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:03.409790 543705 memory.go:184] no items to output this cycle
I0320 19:02:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 19:02:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:13.409789 543705 memory.go:191] Add success.
I0320 19:02:13.409791 543705 cpu.go:282] Add success.
W0320 19:02:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:02:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:02:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:02:13.420163 543705 net.go:648] Add success.
I0320 19:02:13.423024 543705 net.go:770] primary dev: ETH0
I0320 19:02:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:02:13.423051 543705 net.go:698] Add success.
W0320 19:02:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:02:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 19:02:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:02:14.455909 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:02:14.455918 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:02:14.455924 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:02:14.456555 543705 disk_worker.go:494] system disk:vda1
I0320 19:02:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:02:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:02:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:02:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:02:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:02:16.458010 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:02:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:02:16.472356 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:02:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:23.409763 543705 memory.go:184] no items to output this cycle
I0320 19:02:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 19:02:25.676228 543705 disk_info.go:125] begin check local disk info of client
I0320 19:02:25.678675 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:02:25.678681 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035d7c0 0xc00035d800]
E0320 19:02:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:33.409791 543705 memory.go:184] no items to output this cycle
I0320 19:02:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 19:02:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:43.409890 543705 memory.go:191] Add success.
I0320 19:02:43.409929 543705 cpu.go:282] Add success.
I0320 19:02:43.419728 543705 net.go:648] Add success.
I0320 19:02:43.422695 543705 net.go:770] primary dev: ETH0
I0320 19:02:43.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:02:43.422724 543705 net.go:698] Add success.
I0320 19:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:02:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:02:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:02:53.410257 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:02:53.410275 543705 memory.go:184] no items to output this cycle
I0320 19:02:53.410279 543705 cpu.go:275] no items to output this cycle
E0320 19:03:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:03.409779 543705 memory.go:184] no items to output this cycle
I0320 19:03:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 19:03:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:13.409815 543705 memory.go:191] Add success.
I0320 19:03:13.409828 543705 cpu.go:282] Add success.
W0320 19:03:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:03:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:03:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:03:13.420147 543705 net.go:648] Add success.
I0320 19:03:13.423147 543705 net.go:770] primary dev: ETH0
I0320 19:03:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:03:13.423174 543705 net.go:698] Add success.
I0320 19:03:13.469922 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ea546bf6-8842-4184-a389-a73105332933","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:03:13.469957 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:03:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:03:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:03:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 19:03:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:03:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 19:03:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:03:15.455616 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:03:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:03:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:03:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:03:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:03:23.410414 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:23.410426 543705 cpu.go:275] no items to output this cycle
I0320 19:03:23.410429 543705 memory.go:184] no items to output this cycle
I0320 19:03:25.678761 543705 disk_info.go:125] begin check local disk info of client
I0320 19:03:25.681243 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:03:25.681248 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037c5c0 0xc00037c600]
E0320 19:03:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:33.409797 543705 memory.go:184] no items to output this cycle
I0320 19:03:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 19:03:38.593980 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:03:38.593986 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:03:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:43.410932 543705 memory.go:191] Add success.
I0320 19:03:43.409823 543705 cpu.go:282] Add success.
I0320 19:03:43.419940 543705 net.go:648] Add success.
I0320 19:03:43.423120 543705 net.go:770] primary dev: ETH0
I0320 19:03:43.423133 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:03:43.423145 543705 net.go:698] Add success.
I0320 19:03:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:03:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:03:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:03:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:03:53.409773 543705 memory.go:184] no items to output this cycle
I0320 19:03:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 19:04:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:03.409777 543705 memory.go:184] no items to output this cycle
I0320 19:04:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 19:04:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:13.409817 543705 memory.go:191] Add success.
I0320 19:04:13.409822 543705 cpu.go:282] Add success.
W0320 19:04:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:04:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:04:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:04:13.420578 543705 net.go:648] Add success.
I0320 19:04:13.423606 543705 net.go:770] primary dev: ETH0
I0320 19:04:13.423618 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:04:13.423630 543705 net.go:698] Add success.
I0320 19:04:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:04:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:04:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 19:04:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:04:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 19:04:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:04:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:04:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:04:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:04:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:04:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:04:23.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:23.409759 543705 memory.go:184] no items to output this cycle
I0320 19:04:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 19:04:25.681670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:04:25.684084 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:04:25.684090 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be080 0xc0003be0c0]
E0320 19:04:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:33.409802 543705 memory.go:184] no items to output this cycle
I0320 19:04:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 19:04:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:43.409779 543705 memory.go:191] Add success.
I0320 19:04:43.409809 543705 cpu.go:282] Add success.
I0320 19:04:43.419875 543705 net.go:648] Add success.
I0320 19:04:43.422564 543705 net.go:770] primary dev: ETH0
I0320 19:04:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:04:43.422588 543705 net.go:698] Add success.
I0320 19:04:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:04:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:04:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:04:53.409812 543705 memory.go:184] no items to output this cycle
I0320 19:04:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 19:05:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:03.409805 543705 memory.go:184] no items to output this cycle
I0320 19:05:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 19:05:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:13.409786 543705 memory.go:191] Add success.
I0320 19:05:13.409807 543705 cpu.go:282] Add success.
W0320 19:05:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:05:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:05:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:05:13.420310 543705 net.go:648] Add success.
I0320 19:05:13.423307 543705 net.go:770] primary dev: ETH0
I0320 19:05:13.423321 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:05:13.423334 543705 net.go:698] Add success.
I0320 19:05:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:05:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:05:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 19:05:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:05:14.456502 543705 disk_worker.go:494] system disk:vda1
I0320 19:05:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:05:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:05:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:05:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:05:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:05:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:05:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:23.409776 543705 memory.go:184] no items to output this cycle
I0320 19:05:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 19:05:25.685061 543705 disk_info.go:125] begin check local disk info of client
I0320 19:05:25.687522 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:05:25.687528 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be0c0 0xc0003be100]
E0320 19:05:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:33.409774 543705 memory.go:184] no items to output this cycle
I0320 19:05:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 19:05:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:43.409798 543705 memory.go:191] Add success.
I0320 19:05:43.409803 543705 cpu.go:282] Add success.
I0320 19:05:43.419892 543705 net.go:648] Add success.
I0320 19:05:43.422859 543705 net.go:770] primary dev: ETH0
I0320 19:05:43.422874 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:05:43.422889 543705 net.go:698] Add success.
I0320 19:05:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:05:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:05:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:05:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:05:53.409810 543705 memory.go:184] no items to output this cycle
I0320 19:05:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 19:06:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:03.409775 543705 memory.go:184] no items to output this cycle
I0320 19:06:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 19:06:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:13.409813 543705 memory.go:191] Add success.
I0320 19:06:13.409821 543705 cpu.go:282] Add success.
W0320 19:06:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:06:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:06:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:06:13.420312 543705 net.go:648] Add success.
I0320 19:06:13.423160 543705 net.go:770] primary dev: ETH0
I0320 19:06:13.423172 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:06:13.423184 543705 net.go:698] Add success.
I0320 19:06:14.014072 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"059b32e5-21bd-4465-95e2-8ab22d183b5a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:06:14.014110 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:06:14.454681 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:06:14.454858 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:06:14.454936 543705 disk_worker.go:708] disk space is not compliant
W0320 19:06:14.454939 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:06:14.456427 543705 disk_worker.go:494] system disk:vda1
I0320 19:06:14.456461 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:06:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:06:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:06:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:06:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:06:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:06:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:23.409767 543705 memory.go:184] no items to output this cycle
I0320 19:06:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 19:06:25.688082 543705 disk_info.go:125] begin check local disk info of client
I0320 19:06:25.690535 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:06:25.690541 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be600 0xc0003be640]
E0320 19:06:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:33.409773 543705 memory.go:184] no items to output this cycle
I0320 19:06:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 19:06:38.594122 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:06:38.594129 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:06:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:43.410686 543705 memory.go:191] Add success.
I0320 19:06:43.409807 543705 cpu.go:282] Add success.
I0320 19:06:43.420369 543705 net.go:648] Add success.
I0320 19:06:43.423026 543705 net.go:770] primary dev: ETH0
I0320 19:06:43.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:06:43.423051 543705 net.go:698] Add success.
I0320 19:06:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:06:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:06:46.458155 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:06:53.410369 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:06:53.410389 543705 memory.go:184] no items to output this cycle
I0320 19:06:53.410392 543705 cpu.go:275] no items to output this cycle
E0320 19:07:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:03.409818 543705 memory.go:184] no items to output this cycle
I0320 19:07:03.409841 543705 cpu.go:275] no items to output this cycle
E0320 19:07:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:13.409809 543705 memory.go:191] Add success.
I0320 19:07:13.409817 543705 cpu.go:282] Add success.
W0320 19:07:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:07:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:07:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:07:13.420226 543705 net.go:648] Add success.
I0320 19:07:13.422880 543705 net.go:770] primary dev: ETH0
I0320 19:07:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:07:13.422904 543705 net.go:698] Add success.
I0320 19:07:13.453461 543705 event_worker.go:152] Polling the log file for events...
W0320 19:07:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:07:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 19:07:14.455171 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:07:14.456952 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:07:14.456961 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:07:14.456967 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:07:14.457012 543705 disk_worker.go:494] system disk:vda1
I0320 19:07:14.457040 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:07:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:07:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:07:16.457910 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:07:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:07:16.457967 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:07:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:07:16.472321 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:07:23.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:23.409814 543705 memory.go:184] no items to output this cycle
I0320 19:07:23.409825 543705 cpu.go:275] no items to output this cycle
I0320 19:07:25.690696 543705 disk_info.go:125] begin check local disk info of client
I0320 19:07:25.693105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:07:25.693111 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266a00 0xc000266a40]
E0320 19:07:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:33.409772 543705 memory.go:184] no items to output this cycle
I0320 19:07:33.409778 543705 cpu.go:275] no items to output this cycle
E0320 19:07:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:43.409798 543705 memory.go:191] Add success.
I0320 19:07:43.409811 543705 cpu.go:282] Add success.
I0320 19:07:43.419882 543705 net.go:648] Add success.
I0320 19:07:43.422657 543705 net.go:770] primary dev: ETH0
I0320 19:07:43.422670 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:07:43.422683 543705 net.go:698] Add success.
I0320 19:07:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:07:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:07:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:07:53.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:07:53.409913 543705 memory.go:184] no items to output this cycle
I0320 19:07:53.409972 543705 cpu.go:275] no items to output this cycle
E0320 19:08:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:03.409783 543705 memory.go:184] no items to output this cycle
I0320 19:08:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 19:08:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:13.409796 543705 memory.go:191] Add success.
I0320 19:08:13.409795 543705 cpu.go:282] Add success.
W0320 19:08:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:08:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:08:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:08:13.420180 543705 net.go:648] Add success.
I0320 19:08:13.422684 543705 net.go:770] primary dev: ETH0
I0320 19:08:13.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:08:13.422713 543705 net.go:698] Add success.
I0320 19:08:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:08:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:08:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 19:08:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:08:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 19:08:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:08:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:08:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:08:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:08:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:08:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:08:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:23.409782 543705 memory.go:184] no items to output this cycle
I0320 19:08:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 19:08:25.693670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:08:25.696152 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:08:25.696158 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2180 0xc0003b21c0]
E0320 19:08:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:33.409794 543705 memory.go:184] no items to output this cycle
I0320 19:08:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 19:08:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:43.409801 543705 memory.go:191] Add success.
I0320 19:08:43.409801 543705 cpu.go:282] Add success.
I0320 19:08:43.419936 543705 net.go:648] Add success.
I0320 19:08:43.422561 543705 net.go:770] primary dev: ETH0
I0320 19:08:43.422576 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:08:43.422590 543705 net.go:698] Add success.
I0320 19:08:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:08:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:08:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:08:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:08:53.409779 543705 memory.go:184] no items to output this cycle
I0320 19:08:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:09:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:03.409805 543705 memory.go:184] no items to output this cycle
I0320 19:09:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 19:09:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:13.409778 543705 memory.go:191] Add success.
W0320 19:09:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:09:13.409805 543705 cpu.go:282] Add success.
W0320 19:09:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:09:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:09:13.420233 543705 net.go:648] Add success.
I0320 19:09:13.422731 543705 net.go:770] primary dev: ETH0
I0320 19:09:13.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:09:13.422758 543705 net.go:698] Add success.
I0320 19:09:13.604133 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"87f8270a-6c51-4afa-b8fc-5ee8e76bbcf9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:09:13.604167 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:09:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:09:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:09:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 19:09:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:09:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 19:09:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:09:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:09:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:09:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:09:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:09:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:09:23.410287 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:23.410307 543705 memory.go:184] no items to output this cycle
I0320 19:09:23.410314 543705 cpu.go:275] no items to output this cycle
I0320 19:09:25.697114 543705 disk_info.go:125] begin check local disk info of client
I0320 19:09:25.699600 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:09:25.699606 543705 disk_info.go:196] parse disk info done, disk is : [0xc000375ec0 0xc000375f00]
E0320 19:09:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 19:09:33.409790 543705 memory.go:184] no items to output this cycle
I0320 19:09:38.594668 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:09:38.594675 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:09:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:43.410784 543705 memory.go:191] Add success.
I0320 19:09:43.409797 543705 cpu.go:282] Add success.
I0320 19:09:43.420495 543705 net.go:648] Add success.
I0320 19:09:43.423447 543705 net.go:770] primary dev: ETH0
I0320 19:09:43.423460 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:09:43.423472 543705 net.go:698] Add success.
I0320 19:09:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:09:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:09:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:09:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:09:53.409788 543705 memory.go:184] no items to output this cycle
I0320 19:09:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 19:10:03.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:03.409894 543705 cpu.go:275] no items to output this cycle
I0320 19:10:03.409900 543705 memory.go:184] no items to output this cycle
E0320 19:10:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:13.409775 543705 memory.go:191] Add success.
W0320 19:10:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:10:13.409801 543705 cpu.go:282] Add success.
W0320 19:10:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:10:13.409815 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:10:13.420326 543705 net.go:648] Add success.
I0320 19:10:13.422983 543705 net.go:770] primary dev: ETH0
I0320 19:10:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:10:13.423007 543705 net.go:698] Add success.
I0320 19:10:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:10:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:10:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 19:10:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:10:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 19:10:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:10:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:10:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:10:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:10:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:10:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:10:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:23.409794 543705 memory.go:184] no items to output this cycle
I0320 19:10:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 19:10:25.699689 543705 disk_info.go:125] begin check local disk info of client
I0320 19:10:25.702202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:10:25.702208 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270380 0xc0002703c0]
E0320 19:10:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:33.409781 543705 memory.go:184] no items to output this cycle
I0320 19:10:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 19:10:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:43.409795 543705 cpu.go:282] Add success.
I0320 19:10:43.409800 543705 memory.go:191] Add success.
I0320 19:10:43.419868 543705 net.go:648] Add success.
I0320 19:10:43.422351 543705 net.go:770] primary dev: ETH0
I0320 19:10:43.422363 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:10:43.422375 543705 net.go:698] Add success.
I0320 19:10:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:10:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:10:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:10:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:10:53.409785 543705 memory.go:184] no items to output this cycle
I0320 19:10:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 19:11:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:03.409795 543705 memory.go:184] no items to output this cycle
I0320 19:11:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 19:11:13.409903 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:13.409978 543705 memory.go:191] Add success.
W0320 19:11:13.410006 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:11:13.410020 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:11:13.410023 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:11:13.410126 543705 cpu.go:282] Add success.
I0320 19:11:13.419716 543705 net.go:648] Add success.
I0320 19:11:13.422981 543705 net.go:770] primary dev: ETH0
I0320 19:11:13.422994 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:11:13.423005 543705 net.go:698] Add success.
I0320 19:11:14.454949 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:11:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:11:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 19:11:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:11:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 19:11:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:11:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:11:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:11:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:11:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:11:16.472468 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:11:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:11:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 19:11:25.703155 543705 disk_info.go:125] begin check local disk info of client
I0320 19:11:25.705606 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:11:25.705612 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048ad00 0xc00048ad40]
E0320 19:11:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:33.409787 543705 memory.go:184] no items to output this cycle
I0320 19:11:33.409838 543705 cpu.go:275] no items to output this cycle
E0320 19:11:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:43.409778 543705 memory.go:191] Add success.
I0320 19:11:43.409801 543705 cpu.go:282] Add success.
I0320 19:11:43.419871 543705 net.go:648] Add success.
I0320 19:11:43.422845 543705 net.go:770] primary dev: ETH0
I0320 19:11:43.422864 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:11:43.422879 543705 net.go:698] Add success.
I0320 19:11:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:11:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:11:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:11:53.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:11:53.409881 543705 cpu.go:275] no items to output this cycle
I0320 19:11:53.409888 543705 memory.go:184] no items to output this cycle
E0320 19:12:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:03.409789 543705 memory.go:184] no items to output this cycle
I0320 19:12:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 19:12:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:13.409786 543705 memory.go:191] Add success.
I0320 19:12:13.409808 543705 cpu.go:282] Add success.
W0320 19:12:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:12:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:12:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:12:13.420178 543705 net.go:648] Add success.
I0320 19:12:13.422876 543705 net.go:770] primary dev: ETH0
I0320 19:12:13.422893 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:12:13.422908 543705 net.go:698] Add success.
I0320 19:12:13.469936 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5b32a110-03d6-4e0a-94ac-d5777233e34b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:12:13.469970 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 19:12:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:12:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 19:12:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:12:14.455945 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:12:14.455954 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:12:14.455959 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:12:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 19:12:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:12:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:12:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:12:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:12:16.457938 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:12:16.457980 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:12:16.457996 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:12:16.472315 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:12:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:23.409771 543705 cpu.go:275] no items to output this cycle
I0320 19:12:23.409785 543705 memory.go:184] no items to output this cycle
I0320 19:12:25.705676 543705 disk_info.go:125] begin check local disk info of client
I0320 19:12:25.708167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:12:25.708176 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c000 0xc00039c040]
E0320 19:12:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:33.409806 543705 memory.go:184] no items to output this cycle
I0320 19:12:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 19:12:38.594819 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:12:38.594825 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:12:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:43.410685 543705 memory.go:191] Add success.
I0320 19:12:43.409804 543705 cpu.go:282] Add success.
I0320 19:12:43.420656 543705 net.go:648] Add success.
I0320 19:12:43.423366 543705 net.go:770] primary dev: ETH0
I0320 19:12:43.423379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:12:43.423391 543705 net.go:698] Add success.
I0320 19:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:12:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:12:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:12:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:12:53.409808 543705 memory.go:184] no items to output this cycle
I0320 19:12:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 19:13:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:03.409775 543705 memory.go:184] no items to output this cycle
I0320 19:13:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 19:13:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:13.409793 543705 memory.go:191] Add success.
I0320 19:13:13.409798 543705 cpu.go:282] Add success.
W0320 19:13:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:13:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:13:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:13:13.420169 543705 net.go:648] Add success.
I0320 19:13:13.422913 543705 net.go:770] primary dev: ETH0
I0320 19:13:13.422927 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:13:13.422939 543705 net.go:698] Add success.
I0320 19:13:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:13:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:13:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 19:13:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:13:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 19:13:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:13:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:13:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:13:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:13:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:13:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:13:23.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:23.409760 543705 memory.go:184] no items to output this cycle
I0320 19:13:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 19:13:25.709172 543705 disk_info.go:125] begin check local disk info of client
I0320 19:13:25.711718 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:13:25.711725 543705 disk_info.go:196] parse disk info done, disk is : [0xc000282800 0xc000282840]
E0320 19:13:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:33.409773 543705 memory.go:184] no items to output this cycle
I0320 19:13:33.409913 543705 cpu.go:275] no items to output this cycle
E0320 19:13:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:43.409786 543705 memory.go:191] Add success.
I0320 19:13:43.409810 543705 cpu.go:282] Add success.
I0320 19:13:43.419936 543705 net.go:648] Add success.
I0320 19:13:43.422465 543705 net.go:770] primary dev: ETH0
I0320 19:13:43.422478 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:13:43.422490 543705 net.go:698] Add success.
I0320 19:13:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:13:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:13:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:13:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:13:53.409785 543705 cpu.go:275] no items to output this cycle
I0320 19:13:53.409791 543705 memory.go:184] no items to output this cycle
E0320 19:14:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:03.409809 543705 memory.go:184] no items to output this cycle
I0320 19:14:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 19:14:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:13.409786 543705 memory.go:191] Add success.
I0320 19:14:13.409807 543705 cpu.go:282] Add success.
W0320 19:14:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:14:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:14:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:14:13.420074 543705 net.go:648] Add success.
I0320 19:14:13.422726 543705 net.go:770] primary dev: ETH0
I0320 19:14:13.422739 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:14:13.422753 543705 net.go:698] Add success.
I0320 19:14:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:14:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:14:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 19:14:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:14:14.456523 543705 disk_worker.go:494] system disk:vda1
I0320 19:14:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:14:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:14:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:14:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:14:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:14:16.472360 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:14:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:23.409769 543705 memory.go:184] no items to output this cycle
I0320 19:14:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 19:14:25.711806 543705 disk_info.go:125] begin check local disk info of client
I0320 19:14:25.714348 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:14:25.714354 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab200 0xc0001ab240]
E0320 19:14:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:33.409783 543705 memory.go:184] no items to output this cycle
I0320 19:14:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 19:14:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:43.409782 543705 memory.go:191] Add success.
I0320 19:14:43.409785 543705 cpu.go:282] Add success.
I0320 19:14:43.419885 543705 net.go:648] Add success.
I0320 19:14:43.422510 543705 net.go:770] primary dev: ETH0
I0320 19:14:43.422524 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:14:43.422539 543705 net.go:698] Add success.
I0320 19:14:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:14:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:14:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:14:53.409805 543705 memory.go:184] no items to output this cycle
I0320 19:14:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 19:15:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:03.409780 543705 memory.go:184] no items to output this cycle
I0320 19:15:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 19:15:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:13.409812 543705 memory.go:191] Add success.
I0320 19:15:13.409818 543705 cpu.go:282] Add success.
W0320 19:15:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:15:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:15:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:15:13.420099 543705 net.go:648] Add success.
I0320 19:15:13.422644 543705 net.go:770] primary dev: ETH0
I0320 19:15:13.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:15:13.422670 543705 net.go:698] Add success.
I0320 19:15:13.477139 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"eb006d29-f71c-4ad9-986c-ecfe00b999ba","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:15:13.477174 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:15:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:15:14.455347 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:15:14.455447 543705 disk_worker.go:708] disk space is not compliant
W0320 19:15:14.455457 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:15:14.457541 543705 disk_worker.go:494] system disk:vda1
I0320 19:15:14.457570 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:15:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:15:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:15:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:15:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:15:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:15:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:23.409769 543705 memory.go:184] no items to output this cycle
I0320 19:15:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 19:15:25.715215 543705 disk_info.go:125] begin check local disk info of client
I0320 19:15:25.717690 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:15:25.717696 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa0c0 0xc0001aa100]
E0320 19:15:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:33.409775 543705 memory.go:184] no items to output this cycle
I0320 19:15:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 19:15:38.595692 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:15:38.595699 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:15:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:43.409788 543705 memory.go:191] Add success.
I0320 19:15:43.409796 543705 cpu.go:282] Add success.
I0320 19:15:43.419889 543705 net.go:648] Add success.
I0320 19:15:43.420854 543705 net.go:770] primary dev: ETH0
I0320 19:15:43.420868 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:15:43.420880 543705 net.go:698] Add success.
I0320 19:15:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:15:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:15:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:15:53.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:15:53.409827 543705 memory.go:184] no items to output this cycle
I0320 19:15:53.409845 543705 cpu.go:275] no items to output this cycle
E0320 19:16:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:03.409794 543705 memory.go:184] no items to output this cycle
I0320 19:16:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 19:16:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:13.409835 543705 memory.go:191] Add success.
I0320 19:16:13.409838 543705 cpu.go:282] Add success.
W0320 19:16:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:16:13.409885 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:16:13.409889 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:16:13.420123 543705 net.go:648] Add success.
I0320 19:16:13.422855 543705 net.go:770] primary dev: ETH0
I0320 19:16:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:16:13.422884 543705 net.go:698] Add success.
I0320 19:16:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:16:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:16:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 19:16:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:16:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 19:16:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:16:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:16:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:16:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:16:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:16:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:16:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:23.409783 543705 memory.go:184] no items to output this cycle
I0320 19:16:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 19:16:25.717778 543705 disk_info.go:125] begin check local disk info of client
I0320 19:16:25.720271 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:16:25.720278 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aabc0 0xc0001aac00]
E0320 19:16:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:33.409788 543705 memory.go:184] no items to output this cycle
I0320 19:16:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 19:16:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:43.409797 543705 memory.go:191] Add success.
I0320 19:16:43.409799 543705 cpu.go:282] Add success.
I0320 19:16:43.419903 543705 net.go:648] Add success.
I0320 19:16:43.422653 543705 net.go:770] primary dev: ETH0
I0320 19:16:43.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:16:43.422678 543705 net.go:698] Add success.
I0320 19:16:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:16:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:16:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:16:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:16:53.409782 543705 memory.go:184] no items to output this cycle
I0320 19:16:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 19:17:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:03.409793 543705 memory.go:184] no items to output this cycle
I0320 19:17:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 19:17:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:13.409801 543705 cpu.go:282] Add success.
I0320 19:17:13.409809 543705 memory.go:191] Add success.
W0320 19:17:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:17:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:17:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:17:13.420142 543705 net.go:648] Add success.
I0320 19:17:13.422892 543705 net.go:770] primary dev: ETH0
I0320 19:17:13.422905 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:17:13.422916 543705 net.go:698] Add success.
I0320 19:17:13.453448 543705 event_worker.go:152] Polling the log file for events...
W0320 19:17:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:17:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 19:17:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:17:14.456115 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:17:14.456124 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:17:14.456129 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:17:14.456556 543705 disk_worker.go:494] system disk:vda1
I0320 19:17:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:17:15.456854 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:17:15.456863 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:17:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:17:16.457966 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:17:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:17:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:17:16.472367 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:17:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:17:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 19:17:25.721250 543705 disk_info.go:125] begin check local disk info of client
I0320 19:17:25.723663 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:17:25.723669 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e6700 0xc0001e6740]
E0320 19:17:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:33.409781 543705 memory.go:184] no items to output this cycle
I0320 19:17:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 19:17:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:43.409787 543705 cpu.go:282] Add success.
I0320 19:17:43.409787 543705 memory.go:191] Add success.
I0320 19:17:43.419993 543705 net.go:648] Add success.
I0320 19:17:43.423164 543705 net.go:770] primary dev: ETH0
I0320 19:17:43.423180 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:17:43.423194 543705 net.go:698] Add success.
I0320 19:17:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:17:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:17:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:17:53.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:17:53.409927 543705 memory.go:184] no items to output this cycle
I0320 19:17:53.409963 543705 cpu.go:275] no items to output this cycle
E0320 19:18:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:03.409788 543705 cpu.go:275] no items to output this cycle
I0320 19:18:03.409797 543705 memory.go:184] no items to output this cycle
E0320 19:18:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:13.409791 543705 memory.go:191] Add success.
I0320 19:18:13.409804 543705 cpu.go:282] Add success.
W0320 19:18:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:18:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:18:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:18:13.420158 543705 net.go:648] Add success.
I0320 19:18:13.422741 543705 net.go:770] primary dev: ETH0
I0320 19:18:13.422754 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:18:13.422766 543705 net.go:698] Add success.
I0320 19:18:13.464560 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d9e3eccc-1821-482b-a668-1786be526a2a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:18:13.464602 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:18:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:18:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:18:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 19:18:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:18:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 19:18:14.456582 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:18:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:18:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:18:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:18:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:18:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:18:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:23.409769 543705 memory.go:184] no items to output this cycle
I0320 19:18:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 19:18:25.725261 543705 disk_info.go:125] begin check local disk info of client
I0320 19:18:25.727749 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:18:25.727754 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb7c0 0xc0001fb800]
E0320 19:18:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:33.409788 543705 memory.go:184] no items to output this cycle
I0320 19:18:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 19:18:38.596693 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:18:38.596699 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:18:43.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:43.410603 543705 memory.go:191] Add success.
I0320 19:18:43.409900 543705 cpu.go:282] Add success.
I0320 19:18:43.419726 543705 net.go:648] Add success.
I0320 19:18:43.422238 543705 net.go:770] primary dev: ETH0
I0320 19:18:43.422252 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:18:43.422264 543705 net.go:698] Add success.
I0320 19:18:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:18:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:18:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:18:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:18:53.409777 543705 memory.go:184] no items to output this cycle
I0320 19:18:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 19:19:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:03.409804 543705 memory.go:184] no items to output this cycle
I0320 19:19:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 19:19:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:13.409823 543705 memory.go:191] Add success.
I0320 19:19:13.409829 543705 cpu.go:282] Add success.
W0320 19:19:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:19:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:19:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:19:13.420122 543705 net.go:648] Add success.
I0320 19:19:13.422715 543705 net.go:770] primary dev: ETH0
I0320 19:19:13.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:19:13.422745 543705 net.go:698] Add success.
I0320 19:19:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:19:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:19:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 19:19:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:19:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 19:19:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:19:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:19:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:19:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:19:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:19:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:19:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:23.409800 543705 memory.go:184] no items to output this cycle
I0320 19:19:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 19:19:25.727836 543705 disk_info.go:125] begin check local disk info of client
I0320 19:19:25.730302 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:19:25.730308 543705 disk_info.go:196] parse disk info done, disk is : [0xc00057c8c0 0xc00057c900]
E0320 19:19:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:33.409775 543705 memory.go:184] no items to output this cycle
I0320 19:19:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 19:19:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:43.409815 543705 memory.go:191] Add success.
I0320 19:19:43.409823 543705 cpu.go:282] Add success.
I0320 19:19:43.419958 543705 net.go:648] Add success.
I0320 19:19:43.423146 543705 net.go:770] primary dev: ETH0
I0320 19:19:43.423160 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:19:43.423171 543705 net.go:698] Add success.
I0320 19:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:19:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:19:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:19:53.410382 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:19:53.410403 543705 memory.go:184] no items to output this cycle
I0320 19:19:53.410416 543705 cpu.go:275] no items to output this cycle
E0320 19:20:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:03.409787 543705 memory.go:184] no items to output this cycle
I0320 19:20:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 19:20:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:13.409778 543705 memory.go:191] Add success.
W0320 19:20:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:20:13.409808 543705 cpu.go:282] Add success.
W0320 19:20:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:20:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:20:13.420104 543705 net.go:648] Add success.
I0320 19:20:13.423141 543705 net.go:770] primary dev: ETH0
I0320 19:20:13.423156 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:20:13.423170 543705 net.go:698] Add success.
I0320 19:20:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:20:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:20:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 19:20:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:20:14.456557 543705 disk_worker.go:494] system disk:vda1
I0320 19:20:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:20:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:20:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:20:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:20:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:20:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:20:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:20:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 19:20:25.731297 543705 disk_info.go:125] begin check local disk info of client
I0320 19:20:25.733797 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:20:25.733804 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fe680 0xc0003fe6c0]
E0320 19:20:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:33.409767 543705 memory.go:184] no items to output this cycle
I0320 19:20:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 19:20:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:43.409806 543705 memory.go:191] Add success.
I0320 19:20:43.409814 543705 cpu.go:282] Add success.
I0320 19:20:43.419707 543705 net.go:648] Add success.
I0320 19:20:43.422626 543705 net.go:770] primary dev: ETH0
I0320 19:20:43.422639 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:20:43.422651 543705 net.go:698] Add success.
I0320 19:20:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:20:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:20:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:20:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:20:53.409783 543705 memory.go:184] no items to output this cycle
I0320 19:20:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:21:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:03.409798 543705 memory.go:184] no items to output this cycle
I0320 19:21:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 19:21:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:13.409793 543705 memory.go:191] Add success.
I0320 19:21:13.409794 543705 cpu.go:282] Add success.
W0320 19:21:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:21:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:21:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:21:13.420186 543705 net.go:648] Add success.
I0320 19:21:13.422829 543705 net.go:770] primary dev: ETH0
I0320 19:21:13.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:21:13.422854 543705 net.go:698] Add success.
I0320 19:21:13.463644 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"29489938-0bb7-42eb-843c-6919b9f767d4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:21:13.463679 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:21:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:21:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:21:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 19:21:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:21:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 19:21:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:21:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:21:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:21:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:21:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:21:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:21:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:23.409762 543705 memory.go:184] no items to output this cycle
I0320 19:21:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 19:21:25.735314 543705 disk_info.go:125] begin check local disk info of client
I0320 19:21:25.737800 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:21:25.737807 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002950c0 0xc000295100]
E0320 19:21:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:33.409776 543705 memory.go:184] no items to output this cycle
I0320 19:21:33.409787 543705 cpu.go:275] no items to output this cycle
I0320 19:21:38.596839 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:21:38.596846 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:21:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:43.410766 543705 memory.go:191] Add success.
I0320 19:21:43.409951 543705 cpu.go:282] Add success.
I0320 19:21:43.419744 543705 net.go:648] Add success.
I0320 19:21:43.422503 543705 net.go:770] primary dev: ETH0
I0320 19:21:43.422517 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:21:43.422531 543705 net.go:698] Add success.
I0320 19:21:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:21:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:21:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:21:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:21:53.409811 543705 memory.go:184] no items to output this cycle
I0320 19:21:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 19:22:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:03.409795 543705 memory.go:184] no items to output this cycle
I0320 19:22:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:22:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:13.409784 543705 memory.go:191] Add success.
I0320 19:22:13.409784 543705 cpu.go:282] Add success.
W0320 19:22:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:22:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:22:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:22:13.420161 543705 net.go:648] Add success.
I0320 19:22:13.423032 543705 net.go:770] primary dev: ETH0
I0320 19:22:13.423047 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:22:13.423060 543705 net.go:698] Add success.
W0320 19:22:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:22:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 19:22:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:22:14.456798 543705 disk_worker.go:494] system disk:vda1
I0320 19:22:14.456837 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:22:14.457123 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:22:14.457130 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:22:14.457135 543705 custom_config.go:64] query custom config with name: gpu
E0320 19:22:15.456864 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:22:15.456873 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:22:16.457942 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:22:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:22:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:22:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:22:16.472355 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:22:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:23.409778 543705 memory.go:184] no items to output this cycle
I0320 19:22:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 19:22:25.739331 543705 disk_info.go:125] begin check local disk info of client
I0320 19:22:25.741787 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:22:25.741793 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a8600 0xc0004a8640]
E0320 19:22:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:33.409795 543705 memory.go:184] no items to output this cycle
I0320 19:22:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 19:22:43.409925 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:43.409925 543705 cpu.go:282] Add success.
I0320 19:22:43.410074 543705 memory.go:191] Add success.
I0320 19:22:43.419732 543705 net.go:648] Add success.
I0320 19:22:43.422400 543705 net.go:770] primary dev: ETH0
I0320 19:22:43.422415 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:22:43.422430 543705 net.go:698] Add success.
I0320 19:22:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:22:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:22:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:22:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:22:53.409785 543705 memory.go:184] no items to output this cycle
I0320 19:22:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 19:23:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:03.409778 543705 memory.go:184] no items to output this cycle
I0320 19:23:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 19:23:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:13.409791 543705 memory.go:191] Add success.
W0320 19:23:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:23:13.409818 543705 cpu.go:282] Add success.
W0320 19:23:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:23:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:23:13.420161 543705 net.go:648] Add success.
I0320 19:23:13.422775 543705 net.go:770] primary dev: ETH0
I0320 19:23:13.422788 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:23:13.422800 543705 net.go:698] Add success.
I0320 19:23:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:23:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:23:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 19:23:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:23:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 19:23:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:23:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:23:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:23:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:23:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:23:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:23:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 19:23:23.409791 543705 memory.go:184] no items to output this cycle
I0320 19:23:25.743356 543705 disk_info.go:125] begin check local disk info of client
I0320 19:23:25.745806 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:23:25.745813 543705 disk_info.go:196] parse disk info done, disk is : [0xc000312680 0xc0003126c0]
E0320 19:23:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:33.409802 543705 memory.go:184] no items to output this cycle
I0320 19:23:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 19:23:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:43.409778 543705 memory.go:191] Add success.
I0320 19:23:43.409799 543705 cpu.go:282] Add success.
I0320 19:23:43.419849 543705 net.go:648] Add success.
I0320 19:23:43.422581 543705 net.go:770] primary dev: ETH0
I0320 19:23:43.422594 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:23:43.422604 543705 net.go:698] Add success.
I0320 19:23:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:23:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:23:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:23:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:23:53.409792 543705 memory.go:184] no items to output this cycle
I0320 19:23:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 19:24:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:03.409777 543705 memory.go:184] no items to output this cycle
I0320 19:24:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:24:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:13.409812 543705 memory.go:191] Add success.
I0320 19:24:13.409816 543705 cpu.go:282] Add success.
W0320 19:24:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:24:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:24:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:24:13.420067 543705 net.go:648] Add success.
I0320 19:24:13.422591 543705 net.go:770] primary dev: ETH0
I0320 19:24:13.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:24:13.422617 543705 net.go:698] Add success.
I0320 19:24:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:24:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:24:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 19:24:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:24:14.456599 543705 disk_worker.go:494] system disk:vda1
I0320 19:24:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:24:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:24:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:24:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:24:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:24:16.472389 543705 disk_local_worker.go:436] Get disk info: []
I0320 19:24:16.736348 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"770da961-9abd-4ec9-88a2-c910e4e36c7c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:24:16.736385 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
E0320 19:24:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:23.409777 543705 memory.go:184] no items to output this cycle
I0320 19:24:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:24:25.745896 543705 disk_info.go:125] begin check local disk info of client
I0320 19:24:25.748381 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:24:25.748388 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a4ac0 0xc0004a4b00]
E0320 19:24:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:33.409762 543705 memory.go:184] no items to output this cycle
I0320 19:24:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 19:24:38.597000 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:24:38.597006 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:24:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:43.409804 543705 cpu.go:282] Add success.
I0320 19:24:43.410722 543705 memory.go:191] Add success.
I0320 19:24:43.419718 543705 net.go:648] Add success.
I0320 19:24:43.422332 543705 net.go:770] primary dev: ETH0
I0320 19:24:43.422348 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:24:43.422362 543705 net.go:698] Add success.
I0320 19:24:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:24:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:24:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:24:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:24:53.409773 543705 memory.go:184] no items to output this cycle
I0320 19:24:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 19:25:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:03.409776 543705 memory.go:184] no items to output this cycle
I0320 19:25:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 19:25:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:13.409790 543705 memory.go:191] Add success.
I0320 19:25:13.409806 543705 cpu.go:282] Add success.
W0320 19:25:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:25:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:25:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:25:13.420048 543705 net.go:648] Add success.
I0320 19:25:13.422814 543705 net.go:770] primary dev: ETH0
I0320 19:25:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:25:13.422838 543705 net.go:698] Add success.
I0320 19:25:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:25:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:25:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 19:25:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:25:14.456466 543705 disk_worker.go:494] system disk:vda1
I0320 19:25:14.456509 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:25:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:25:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:25:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:25:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:25:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:25:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:23.409807 543705 memory.go:184] no items to output this cycle
I0320 19:25:23.409829 543705 cpu.go:275] no items to output this cycle
I0320 19:25:25.748472 543705 disk_info.go:125] begin check local disk info of client
I0320 19:25:25.751033 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:25:25.751038 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035fb80 0xc00035fbc0]
E0320 19:25:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:33.409761 543705 memory.go:184] no items to output this cycle
I0320 19:25:33.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:25:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:43.409804 543705 memory.go:191] Add success.
I0320 19:25:43.409811 543705 cpu.go:282] Add success.
I0320 19:25:43.420004 543705 net.go:648] Add success.
I0320 19:25:43.422707 543705 net.go:770] primary dev: ETH0
I0320 19:25:43.422722 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:25:43.422736 543705 net.go:698] Add success.
I0320 19:25:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:25:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:25:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:25:53.410232 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:25:53.410257 543705 memory.go:184] no items to output this cycle
I0320 19:25:53.410270 543705 cpu.go:275] no items to output this cycle
E0320 19:26:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:03.409796 543705 memory.go:184] no items to output this cycle
I0320 19:26:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:26:13.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:13.409831 543705 memory.go:191] Add success.
I0320 19:26:13.409835 543705 cpu.go:282] Add success.
W0320 19:26:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:26:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:26:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:26:13.420136 543705 net.go:648] Add success.
I0320 19:26:13.423020 543705 net.go:770] primary dev: ETH0
I0320 19:26:13.423033 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:26:13.423045 543705 net.go:698] Add success.
I0320 19:26:14.454599 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:26:14.454828 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:26:14.454838 543705 disk_worker.go:708] disk space is not compliant
W0320 19:26:14.454840 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:26:14.456215 543705 disk_worker.go:494] system disk:vda1
I0320 19:26:14.456245 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:26:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:26:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:26:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:26:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:26:16.472445 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:26:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:23.409795 543705 memory.go:184] no items to output this cycle
I0320 19:26:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 19:26:25.752402 543705 disk_info.go:125] begin check local disk info of client
I0320 19:26:25.754919 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:26:25.754926 543705 disk_info.go:196] parse disk info done, disk is : [0xc000577800 0xc000577840]
E0320 19:26:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:33.409796 543705 memory.go:184] no items to output this cycle
I0320 19:26:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 19:26:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:43.409819 543705 memory.go:191] Add success.
I0320 19:26:43.409827 543705 cpu.go:282] Add success.
I0320 19:26:43.419691 543705 net.go:770] primary dev: ETH0
I0320 19:26:43.419703 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:26:43.419718 543705 net.go:698] Add success.
I0320 19:26:43.420112 543705 net.go:648] Add success.
I0320 19:26:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:26:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:26:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:26:53.409791 543705 memory.go:184] no items to output this cycle
I0320 19:26:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 19:27:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:03.409777 543705 memory.go:184] no items to output this cycle
I0320 19:27:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 19:27:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:13.409827 543705 memory.go:191] Add success.
I0320 19:27:13.409835 543705 cpu.go:282] Add success.
W0320 19:27:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:27:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:27:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:27:13.420153 543705 net.go:648] Add success.
I0320 19:27:13.422643 543705 net.go:770] primary dev: ETH0
I0320 19:27:13.422658 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:27:13.422672 543705 net.go:698] Add success.
I0320 19:27:13.428995 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 19:27:13.453168 543705 event_worker.go:152] Polling the log file for events...
I0320 19:27:13.900592 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f27a3251-ed18-43e8-8532-ce8eb2fd8a5d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:27:13.900628 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 19:27:14.454159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:27:14.454231 543705 disk_worker.go:708] disk space is not compliant
W0320 19:27:14.454234 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:27:14.454949 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:27:14.454959 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:27:14.454964 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:27:14.455760 543705 disk_worker.go:494] system disk:vda1
I0320 19:27:14.455790 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:27:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:27:15.456820 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:27:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:27:16.457978 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:27:16.458021 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:27:16.458039 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:27:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:27:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 19:27:23.409782 543705 memory.go:184] no items to output this cycle
I0320 19:27:25.756410 543705 disk_info.go:125] begin check local disk info of client
I0320 19:27:25.758920 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:27:25.758926 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a6fc0 0xc0004a7000]
E0320 19:27:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:33.409777 543705 memory.go:184] no items to output this cycle
I0320 19:27:33.409789 543705 cpu.go:275] no items to output this cycle
I0320 19:27:38.597697 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:27:38.597703 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:27:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:43.410721 543705 memory.go:191] Add success.
I0320 19:27:43.409816 543705 cpu.go:282] Add success.
I0320 19:27:43.419544 543705 net.go:770] primary dev: ETH0
I0320 19:27:43.419557 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:27:43.419570 543705 net.go:698] Add success.
I0320 19:27:43.419794 543705 net.go:648] Add success.
I0320 19:27:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:27:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:27:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:27:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:27:53.409790 543705 memory.go:184] no items to output this cycle
I0320 19:27:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 19:28:03.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:03.409817 543705 memory.go:184] no items to output this cycle
I0320 19:28:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 19:28:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:13.409810 543705 memory.go:191] Add success.
I0320 19:28:13.409816 543705 cpu.go:282] Add success.
W0320 19:28:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:28:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:28:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:28:13.420154 543705 net.go:648] Add success.
I0320 19:28:13.422922 543705 net.go:770] primary dev: ETH0
I0320 19:28:13.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:28:13.422947 543705 net.go:698] Add success.
I0320 19:28:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:28:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:28:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 19:28:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:28:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 19:28:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:28:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:28:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:28:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:28:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:28:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:28:23.410206 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:23.410223 543705 memory.go:184] no items to output this cycle
I0320 19:28:23.410228 543705 cpu.go:275] no items to output this cycle
I0320 19:28:25.759010 543705 disk_info.go:125] begin check local disk info of client
I0320 19:28:25.761494 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:28:25.761499 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b1dc0 0xc0002b1e00]
E0320 19:28:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:33.409780 543705 memory.go:184] no items to output this cycle
I0320 19:28:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 19:28:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:43.409780 543705 memory.go:191] Add success.
I0320 19:28:43.409805 543705 cpu.go:282] Add success.
I0320 19:28:43.419990 543705 net.go:648] Add success.
I0320 19:28:43.422708 543705 net.go:770] primary dev: ETH0
I0320 19:28:43.422721 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:28:43.422733 543705 net.go:698] Add success.
I0320 19:28:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:28:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:28:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:28:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:28:53.409780 543705 memory.go:184] no items to output this cycle
I0320 19:28:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 19:29:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:03.409801 543705 memory.go:184] no items to output this cycle
I0320 19:29:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 19:29:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:13.409815 543705 memory.go:191] Add success.
I0320 19:29:13.409821 543705 cpu.go:282] Add success.
W0320 19:29:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:29:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:29:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:29:13.420135 543705 net.go:648] Add success.
I0320 19:29:13.422730 543705 net.go:770] primary dev: ETH0
I0320 19:29:13.422743 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:29:13.422754 543705 net.go:698] Add success.
I0320 19:29:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:29:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:29:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 19:29:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:29:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 19:29:14.456538 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:29:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:29:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:29:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:29:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:29:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:29:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:23.409781 543705 memory.go:184] no items to output this cycle
I0320 19:29:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 19:29:25.761668 543705 disk_info.go:125] begin check local disk info of client
I0320 19:29:25.764187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:29:25.764193 543705 disk_info.go:196] parse disk info done, disk is : [0xc00025ad40 0xc00025ad80]
E0320 19:29:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:33.409801 543705 memory.go:184] no items to output this cycle
I0320 19:29:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 19:29:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:43.409782 543705 memory.go:191] Add success.
I0320 19:29:43.409805 543705 cpu.go:282] Add success.
I0320 19:29:43.419905 543705 net.go:648] Add success.
I0320 19:29:43.422471 543705 net.go:770] primary dev: ETH0
I0320 19:29:43.422484 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:29:43.422660 543705 net.go:698] Add success.
I0320 19:29:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:29:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:29:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:29:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:29:53.409778 543705 memory.go:184] no items to output this cycle
I0320 19:29:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 19:30:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:03.409785 543705 memory.go:184] no items to output this cycle
I0320 19:30:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 19:30:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:13.409812 543705 memory.go:191] Add success.
I0320 19:30:13.409824 543705 cpu.go:282] Add success.
W0320 19:30:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:30:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:30:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:30:13.420277 543705 net.go:648] Add success.
I0320 19:30:13.423163 543705 net.go:770] primary dev: ETH0
I0320 19:30:13.423176 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:30:13.423188 543705 net.go:698] Add success.
I0320 19:30:13.550442 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"db30b28a-4715-4e7a-9719-019a6b46133d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:30:13.550476 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:30:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:30:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:30:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 19:30:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:30:14.456517 543705 disk_worker.go:494] system disk:vda1
I0320 19:30:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:30:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:30:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:30:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:30:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:30:16.472470 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:30:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:23.409779 543705 memory.go:184] no items to output this cycle
I0320 19:30:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 19:30:25.764275 543705 disk_info.go:125] begin check local disk info of client
I0320 19:30:25.766758 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:30:25.766764 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272640 0xc000272680]
E0320 19:30:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:33.409761 543705 memory.go:184] no items to output this cycle
I0320 19:30:33.409800 543705 cpu.go:275] no items to output this cycle
I0320 19:30:38.598547 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:30:38.598553 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:30:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:43.410638 543705 memory.go:191] Add success.
I0320 19:30:43.409814 543705 cpu.go:282] Add success.
I0320 19:30:43.420365 543705 net.go:648] Add success.
I0320 19:30:43.422705 543705 net.go:770] primary dev: ETH0
I0320 19:30:43.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:30:43.422732 543705 net.go:698] Add success.
I0320 19:30:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:30:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:30:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:30:53.409846 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:30:53.409868 543705 memory.go:184] no items to output this cycle
I0320 19:30:53.409996 543705 cpu.go:275] no items to output this cycle
E0320 19:31:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:03.409773 543705 memory.go:184] no items to output this cycle
I0320 19:31:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 19:31:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:13.409819 543705 memory.go:191] Add success.
I0320 19:31:13.409823 543705 cpu.go:282] Add success.
W0320 19:31:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:31:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:31:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:31:13.420305 543705 net.go:648] Add success.
I0320 19:31:13.422860 543705 net.go:770] primary dev: ETH0
I0320 19:31:13.422874 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:31:13.422885 543705 net.go:698] Add success.
I0320 19:31:14.454997 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:31:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:31:14.455259 543705 disk_worker.go:708] disk space is not compliant
W0320 19:31:14.455264 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:31:14.457247 543705 disk_worker.go:494] system disk:vda1
I0320 19:31:14.457297 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:31:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:31:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:31:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:31:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:31:16.472475 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:31:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:23.409778 543705 memory.go:184] no items to output this cycle
I0320 19:31:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 19:31:25.768477 543705 disk_info.go:125] begin check local disk info of client
I0320 19:31:25.770977 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:31:25.770984 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046dcc0 0xc00046dd00]
E0320 19:31:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:33.409775 543705 memory.go:184] no items to output this cycle
I0320 19:31:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:31:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:43.409812 543705 memory.go:191] Add success.
I0320 19:31:43.409819 543705 cpu.go:282] Add success.
I0320 19:31:43.419851 543705 net.go:648] Add success.
I0320 19:31:43.422515 543705 net.go:770] primary dev: ETH0
I0320 19:31:43.422529 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:31:43.422544 543705 net.go:698] Add success.
I0320 19:31:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:31:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:31:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:31:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:31:53.409781 543705 memory.go:184] no items to output this cycle
I0320 19:31:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:32:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:03.409778 543705 memory.go:184] no items to output this cycle
I0320 19:32:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:32:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:13.409782 543705 memory.go:191] Add success.
W0320 19:32:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:32:13.409809 543705 cpu.go:282] Add success.
W0320 19:32:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:32:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:32:13.420133 543705 net.go:648] Add success.
I0320 19:32:13.423125 543705 net.go:770] primary dev: ETH0
I0320 19:32:13.423139 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:32:13.423151 543705 net.go:698] Add success.
W0320 19:32:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:32:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 19:32:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:32:14.456819 543705 disk_worker.go:494] system disk:vda1
I0320 19:32:14.456860 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:32:14.457177 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:32:14.457185 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:32:14.457190 543705 custom_config.go:64] query custom config with name: gpu
E0320 19:32:15.456852 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:32:15.456862 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:32:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:32:16.457937 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:32:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:32:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:32:16.472322 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:32:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:23.409764 543705 memory.go:184] no items to output this cycle
I0320 19:32:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 19:32:25.771084 543705 disk_info.go:125] begin check local disk info of client
I0320 19:32:25.773862 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:32:25.773870 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b18c0 0xc0003b1900]
E0320 19:32:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:33.409812 543705 memory.go:184] no items to output this cycle
I0320 19:32:33.409826 543705 cpu.go:275] no items to output this cycle
E0320 19:32:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:43.409774 543705 memory.go:191] Add success.
I0320 19:32:43.409811 543705 cpu.go:282] Add success.
I0320 19:32:43.419820 543705 net.go:648] Add success.
I0320 19:32:43.422480 543705 net.go:770] primary dev: ETH0
I0320 19:32:43.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:32:43.422505 543705 net.go:698] Add success.
I0320 19:32:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:32:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:32:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:32:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:32:53.409788 543705 memory.go:184] no items to output this cycle
I0320 19:32:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 19:33:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:03.409781 543705 memory.go:184] no items to output this cycle
I0320 19:33:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 19:33:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:13.409827 543705 memory.go:191] Add success.
I0320 19:33:13.409828 543705 cpu.go:282] Add success.
W0320 19:33:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:33:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:33:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:33:13.420189 543705 net.go:648] Add success.
I0320 19:33:13.423407 543705 net.go:770] primary dev: ETH0
I0320 19:33:13.423424 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:33:13.423450 543705 net.go:698] Add success.
I0320 19:33:13.468730 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f40a441d-ae88-42a2-a88f-a4264cf4148f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:33:13.468763 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:33:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:33:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:33:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 19:33:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:33:14.456526 543705 disk_worker.go:494] system disk:vda1
I0320 19:33:14.456570 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:33:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:33:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:33:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:33:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:33:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:33:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 19:33:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:33:25.775506 543705 disk_info.go:125] begin check local disk info of client
I0320 19:33:25.778002 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:33:25.778007 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002666c0 0xc000266700]
E0320 19:33:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:33.409798 543705 memory.go:184] no items to output this cycle
I0320 19:33:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:33:38.598694 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:33:38.598700 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:33:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:43.410651 543705 memory.go:191] Add success.
I0320 19:33:43.409827 543705 cpu.go:282] Add success.
I0320 19:33:43.420358 543705 net.go:648] Add success.
I0320 19:33:43.422816 543705 net.go:770] primary dev: ETH0
I0320 19:33:43.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:33:43.422845 543705 net.go:698] Add success.
I0320 19:33:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:33:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:33:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:33:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:33:53.409784 543705 memory.go:184] no items to output this cycle
I0320 19:33:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 19:34:03.409922 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:03.409961 543705 memory.go:184] no items to output this cycle
I0320 19:34:03.409989 543705 cpu.go:275] no items to output this cycle
E0320 19:34:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:13.409785 543705 memory.go:191] Add success.
W0320 19:34:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:34:13.409819 543705 cpu.go:282] Add success.
W0320 19:34:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:34:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:34:13.420179 543705 net.go:648] Add success.
I0320 19:34:13.423134 543705 net.go:770] primary dev: ETH0
I0320 19:34:13.423148 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:34:13.423161 543705 net.go:698] Add success.
I0320 19:34:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:34:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:34:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 19:34:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:34:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 19:34:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:34:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:34:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:34:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:34:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:34:23.410416 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:23.410434 543705 memory.go:184] no items to output this cycle
I0320 19:34:23.410443 543705 cpu.go:275] no items to output this cycle
I0320 19:34:25.779536 543705 disk_info.go:125] begin check local disk info of client
I0320 19:34:25.782090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:34:25.782096 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b640 0xc00007b680]
E0320 19:34:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:33.409768 543705 memory.go:184] no items to output this cycle
I0320 19:34:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 19:34:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:43.409795 543705 memory.go:191] Add success.
I0320 19:34:43.409796 543705 cpu.go:282] Add success.
I0320 19:34:43.420054 543705 net.go:648] Add success.
I0320 19:34:43.422766 543705 net.go:770] primary dev: ETH0
I0320 19:34:43.422778 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:34:43.422792 543705 net.go:698] Add success.
I0320 19:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:34:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:34:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:34:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:34:53.409810 543705 memory.go:184] no items to output this cycle
I0320 19:34:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 19:35:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:03.409767 543705 memory.go:184] no items to output this cycle
I0320 19:35:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 19:35:13.409940 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:13.409977 543705 memory.go:191] Add success.
W0320 19:35:13.410018 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:35:13.409941 543705 cpu.go:282] Add success.
W0320 19:35:13.410033 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:35:13.410036 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:35:13.419755 543705 net.go:648] Add success.
I0320 19:35:13.422316 543705 net.go:770] primary dev: ETH0
I0320 19:35:13.422329 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:35:13.422343 543705 net.go:698] Add success.
I0320 19:35:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:35:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:35:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 19:35:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:35:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 19:35:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:35:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:35:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:35:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:35:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:35:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:35:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:23.409785 543705 memory.go:184] no items to output this cycle
I0320 19:35:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 19:35:25.782180 543705 disk_info.go:125] begin check local disk info of client
I0320 19:35:25.784705 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:35:25.784711 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb740 0xc0001fb780]
E0320 19:35:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:33.409775 543705 memory.go:184] no items to output this cycle
I0320 19:35:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 19:35:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:43.409804 543705 memory.go:191] Add success.
I0320 19:35:43.409812 543705 cpu.go:282] Add success.
I0320 19:35:43.419875 543705 net.go:648] Add success.
I0320 19:35:43.422649 543705 net.go:770] primary dev: ETH0
I0320 19:35:43.422663 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:35:43.422674 543705 net.go:698] Add success.
I0320 19:35:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:35:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:35:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:35:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:35:53.409795 543705 memory.go:184] no items to output this cycle
I0320 19:35:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 19:36:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:03.409791 543705 memory.go:184] no items to output this cycle
I0320 19:36:03.409843 543705 cpu.go:275] no items to output this cycle
E0320 19:36:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:13.409813 543705 memory.go:191] Add success.
I0320 19:36:13.409814 543705 cpu.go:282] Add success.
W0320 19:36:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:36:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:36:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:36:13.420504 543705 net.go:648] Add success.
I0320 19:36:13.423339 543705 net.go:770] primary dev: ETH0
I0320 19:36:13.423352 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:36:13.423364 543705 net.go:698] Add success.
I0320 19:36:13.469121 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"62748330-f46e-4263-9458-eb95c428701f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:36:13.469152 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:36:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:36:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:36:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 19:36:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:36:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 19:36:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:36:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:36:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:36:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:36:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:36:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:36:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:23.409807 543705 memory.go:184] no items to output this cycle
I0320 19:36:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 19:36:25.785671 543705 disk_info.go:125] begin check local disk info of client
I0320 19:36:25.788093 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:36:25.788099 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039c700 0xc00039c740]
E0320 19:36:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:33.409765 543705 memory.go:184] no items to output this cycle
I0320 19:36:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 19:36:38.599705 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:36:38.599712 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:36:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:43.410521 543705 memory.go:191] Add success.
I0320 19:36:43.409812 543705 cpu.go:282] Add success.
I0320 19:36:43.420270 543705 net.go:648] Add success.
I0320 19:36:43.422914 543705 net.go:770] primary dev: ETH0
I0320 19:36:43.422926 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:36:43.422939 543705 net.go:698] Add success.
I0320 19:36:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:36:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:36:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:36:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:36:53.409779 543705 memory.go:184] no items to output this cycle
I0320 19:36:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 19:37:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:03.409800 543705 memory.go:184] no items to output this cycle
I0320 19:37:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 19:37:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:13.409819 543705 memory.go:191] Add success.
I0320 19:37:13.409821 543705 cpu.go:282] Add success.
W0320 19:37:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:37:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:37:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:37:13.420195 543705 net.go:648] Add success.
I0320 19:37:13.422825 543705 net.go:770] primary dev: ETH0
I0320 19:37:13.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:37:13.422854 543705 net.go:698] Add success.
I0320 19:37:13.453427 543705 event_worker.go:152] Polling the log file for events...
W0320 19:37:14.455399 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:37:14.455415 543705 disk_worker.go:708] disk space is not compliant
W0320 19:37:14.455419 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:37:14.456135 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:37:14.456144 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:37:14.456150 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:37:14.457336 543705 disk_worker.go:494] system disk:vda1
I0320 19:37:14.457361 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:37:15.456775 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:37:15.456784 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:37:16.457899 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:37:16.457899 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:37:16.457952 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:37:16.457970 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:37:16.472270 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:37:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:23.409790 543705 memory.go:184] no items to output this cycle
I0320 19:37:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 19:37:25.788180 543705 disk_info.go:125] begin check local disk info of client
I0320 19:37:25.790627 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:37:25.790635 543705 disk_info.go:196] parse disk info done, disk is : [0xc000314000 0xc000314040]
E0320 19:37:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:33.409798 543705 memory.go:184] no items to output this cycle
I0320 19:37:33.409832 543705 cpu.go:275] no items to output this cycle
E0320 19:37:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:43.409779 543705 memory.go:191] Add success.
I0320 19:37:43.409815 543705 cpu.go:282] Add success.
I0320 19:37:43.420011 543705 net.go:648] Add success.
I0320 19:37:43.422891 543705 net.go:770] primary dev: ETH0
I0320 19:37:43.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:37:43.422920 543705 net.go:698] Add success.
I0320 19:37:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:37:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:37:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:37:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:37:53.409791 543705 memory.go:184] no items to output this cycle
I0320 19:37:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 19:38:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:03.409816 543705 memory.go:184] no items to output this cycle
I0320 19:38:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 19:38:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:13.409798 543705 memory.go:191] Add success.
I0320 19:38:13.409804 543705 cpu.go:282] Add success.
W0320 19:38:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:38:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:38:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:38:13.420046 543705 net.go:648] Add success.
I0320 19:38:13.422903 543705 net.go:770] primary dev: ETH0
I0320 19:38:13.422918 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:38:13.422931 543705 net.go:698] Add success.
I0320 19:38:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:38:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:38:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 19:38:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:38:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 19:38:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:38:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:38:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:38:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:38:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:38:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:38:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:23.409778 543705 memory.go:184] no items to output this cycle
I0320 19:38:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 19:38:25.791591 543705 disk_info.go:125] begin check local disk info of client
I0320 19:38:25.794061 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:38:25.794067 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa440 0xc0001fa480]
E0320 19:38:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:33.409771 543705 memory.go:184] no items to output this cycle
I0320 19:38:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:38:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:43.409815 543705 memory.go:191] Add success.
I0320 19:38:43.409825 543705 cpu.go:282] Add success.
I0320 19:38:43.419989 543705 net.go:648] Add success.
I0320 19:38:43.422957 543705 net.go:770] primary dev: ETH0
I0320 19:38:43.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:38:43.422982 543705 net.go:698] Add success.
I0320 19:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:38:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:38:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:38:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:38:53.409777 543705 memory.go:184] no items to output this cycle
I0320 19:38:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 19:39:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:03.409776 543705 memory.go:184] no items to output this cycle
I0320 19:39:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 19:39:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:13.409797 543705 memory.go:191] Add success.
I0320 19:39:13.409798 543705 cpu.go:282] Add success.
W0320 19:39:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:39:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:39:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:39:13.420238 543705 net.go:648] Add success.
I0320 19:39:13.423067 543705 net.go:770] primary dev: ETH0
I0320 19:39:13.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:39:13.423091 543705 net.go:698] Add success.
I0320 19:39:13.683216 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"317c83fe-b034-4e43-a328-fa89ca39c0b1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:39:13.683256 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:39:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:39:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:39:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 19:39:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:39:14.456612 543705 disk_worker.go:494] system disk:vda1
I0320 19:39:14.456645 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:39:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:39:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:39:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:39:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:39:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:39:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:23.409799 543705 memory.go:184] no items to output this cycle
I0320 19:39:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 19:39:25.794148 543705 disk_info.go:125] begin check local disk info of client
I0320 19:39:25.796610 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:39:25.796616 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b140 0xc00007b180]
E0320 19:39:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:33.409797 543705 memory.go:184] no items to output this cycle
I0320 19:39:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 19:39:38.599855 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:39:38.599861 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:39:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:43.410639 543705 memory.go:191] Add success.
I0320 19:39:43.409798 543705 cpu.go:282] Add success.
I0320 19:39:43.420312 543705 net.go:648] Add success.
I0320 19:39:43.422718 543705 net.go:770] primary dev: ETH0
I0320 19:39:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:39:43.422742 543705 net.go:698] Add success.
I0320 19:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:39:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:39:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:39:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:39:53.409804 543705 memory.go:184] no items to output this cycle
I0320 19:39:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 19:40:03.410718 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:03.410736 543705 memory.go:184] no items to output this cycle
I0320 19:40:03.410767 543705 cpu.go:275] no items to output this cycle
E0320 19:40:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:13.409814 543705 memory.go:191] Add success.
I0320 19:40:13.409822 543705 cpu.go:282] Add success.
W0320 19:40:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:40:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:40:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:40:13.420068 543705 net.go:648] Add success.
I0320 19:40:13.422665 543705 net.go:770] primary dev: ETH0
I0320 19:40:13.422681 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:40:13.422694 543705 net.go:698] Add success.
I0320 19:40:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:40:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:40:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 19:40:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:40:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 19:40:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:40:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:40:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:40:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:40:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:40:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:40:23.409843 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:23.409860 543705 memory.go:184] no items to output this cycle
I0320 19:40:23.409952 543705 cpu.go:275] no items to output this cycle
I0320 19:40:25.797622 543705 disk_info.go:125] begin check local disk info of client
I0320 19:40:25.800122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:40:25.800128 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035c500 0xc00035c540]
E0320 19:40:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:33.409781 543705 cpu.go:275] no items to output this cycle
I0320 19:40:33.409786 543705 memory.go:184] no items to output this cycle
E0320 19:40:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:43.409809 543705 memory.go:191] Add success.
I0320 19:40:43.409820 543705 cpu.go:282] Add success.
I0320 19:40:43.419900 543705 net.go:648] Add success.
I0320 19:40:43.422712 543705 net.go:770] primary dev: ETH0
I0320 19:40:43.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:40:43.422743 543705 net.go:698] Add success.
I0320 19:40:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:40:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:40:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:40:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:40:53.409777 543705 memory.go:184] no items to output this cycle
I0320 19:40:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 19:41:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:03.409797 543705 memory.go:184] no items to output this cycle
I0320 19:41:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 19:41:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:13.409783 543705 memory.go:191] Add success.
I0320 19:41:13.409795 543705 cpu.go:282] Add success.
W0320 19:41:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:41:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:41:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:41:13.420146 543705 net.go:648] Add success.
I0320 19:41:13.423113 543705 net.go:770] primary dev: ETH0
I0320 19:41:13.423126 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:41:13.423137 543705 net.go:698] Add success.
I0320 19:41:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:41:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:41:14.455224 543705 disk_worker.go:708] disk space is not compliant
W0320 19:41:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:41:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 19:41:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:41:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:41:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:41:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:41:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:41:16.472363 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:41:23.410737 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:23.410750 543705 memory.go:184] no items to output this cycle
I0320 19:41:23.410753 543705 cpu.go:275] no items to output this cycle
I0320 19:41:25.801652 543705 disk_info.go:125] begin check local disk info of client
I0320 19:41:25.804085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:41:25.804090 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cb9c0 0xc0004cba00]
E0320 19:41:33.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:33.409898 543705 memory.go:184] no items to output this cycle
I0320 19:41:33.409965 543705 cpu.go:275] no items to output this cycle
E0320 19:41:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:43.409804 543705 memory.go:191] Add success.
I0320 19:41:43.409833 543705 cpu.go:282] Add success.
I0320 19:41:43.419910 543705 net.go:648] Add success.
I0320 19:41:43.422847 543705 net.go:770] primary dev: ETH0
I0320 19:41:43.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:41:43.422876 543705 net.go:698] Add success.
I0320 19:41:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:41:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:41:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:41:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:41:53.409796 543705 memory.go:184] no items to output this cycle
I0320 19:41:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 19:42:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:03.409794 543705 memory.go:184] no items to output this cycle
I0320 19:42:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 19:42:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:13.409791 543705 memory.go:191] Add success.
W0320 19:42:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:42:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:42:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:42:13.409835 543705 cpu.go:282] Add success.
I0320 19:42:13.420122 543705 net.go:648] Add success.
I0320 19:42:13.423069 543705 net.go:770] primary dev: ETH0
I0320 19:42:13.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:42:13.423094 543705 net.go:698] Add success.
I0320 19:42:13.468820 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"85e163f5-b92a-415a-ac53-2ef3ea1b2feb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:42:13.468855 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 19:42:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:42:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 19:42:14.455171 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:42:14.456825 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:42:14.456833 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:42:14.456838 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:42:14.457106 543705 disk_worker.go:494] system disk:vda1
I0320 19:42:14.457132 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:42:15.456840 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:42:15.456849 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:42:16.457941 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:42:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:42:16.457992 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:42:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:42:16.472330 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:42:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:23.409790 543705 memory.go:184] no items to output this cycle
I0320 19:42:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 19:42:25.805667 543705 disk_info.go:125] begin check local disk info of client
I0320 19:42:25.808160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:42:25.808166 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bcb40 0xc0002bcb80]
E0320 19:42:33.409876 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:33.409895 543705 memory.go:184] no items to output this cycle
I0320 19:42:33.409977 543705 cpu.go:275] no items to output this cycle
I0320 19:42:38.600702 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:42:38.600709 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:42:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:43.410647 543705 memory.go:191] Add success.
I0320 19:42:43.409827 543705 cpu.go:282] Add success.
I0320 19:42:43.420342 543705 net.go:648] Add success.
I0320 19:42:43.422951 543705 net.go:770] primary dev: ETH0
I0320 19:42:43.422963 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:42:43.422976 543705 net.go:698] Add success.
I0320 19:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:42:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:42:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:42:53.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:42:53.409816 543705 memory.go:184] no items to output this cycle
I0320 19:42:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 19:43:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:03.409775 543705 memory.go:184] no items to output this cycle
I0320 19:43:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 19:43:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:13.409786 543705 memory.go:191] Add success.
I0320 19:43:13.409808 543705 cpu.go:282] Add success.
W0320 19:43:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:43:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:43:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:43:13.420304 543705 net.go:648] Add success.
I0320 19:43:13.423122 543705 net.go:770] primary dev: ETH0
I0320 19:43:13.423135 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:43:13.423148 543705 net.go:698] Add success.
I0320 19:43:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:43:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:43:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 19:43:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:43:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 19:43:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:43:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:43:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:43:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:43:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:43:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:43:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:23.409780 543705 memory.go:184] no items to output this cycle
I0320 19:43:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 19:43:25.809670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:43:25.812117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:43:25.812122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f43c0 0xc0003f4400]
E0320 19:43:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:33.409874 543705 memory.go:184] no items to output this cycle
I0320 19:43:33.409952 543705 cpu.go:275] no items to output this cycle
E0320 19:43:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:43.409797 543705 memory.go:191] Add success.
I0320 19:43:43.409798 543705 cpu.go:282] Add success.
I0320 19:43:43.420003 543705 net.go:648] Add success.
I0320 19:43:43.422750 543705 net.go:770] primary dev: ETH0
I0320 19:43:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:43:43.422774 543705 net.go:698] Add success.
I0320 19:43:46.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:43:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:43:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:43:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:43:53.409778 543705 memory.go:184] no items to output this cycle
I0320 19:43:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 19:44:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:03.409816 543705 memory.go:184] no items to output this cycle
I0320 19:44:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 19:44:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:13.409785 543705 memory.go:191] Add success.
I0320 19:44:13.409800 543705 cpu.go:282] Add success.
W0320 19:44:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:44:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:44:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:44:13.420043 543705 net.go:648] Add success.
I0320 19:44:13.422589 543705 net.go:770] primary dev: ETH0
I0320 19:44:13.422602 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:44:13.422614 543705 net.go:698] Add success.
I0320 19:44:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:44:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:44:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 19:44:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:44:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 19:44:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:44:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:44:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:44:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:44:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:44:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:44:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:23.409792 543705 memory.go:184] no items to output this cycle
I0320 19:44:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:44:25.813678 543705 disk_info.go:125] begin check local disk info of client
I0320 19:44:25.816170 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:44:25.816178 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bb640 0xc0002bb680]
E0320 19:44:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:33.409778 543705 memory.go:184] no items to output this cycle
I0320 19:44:33.409779 543705 cpu.go:275] no items to output this cycle
E0320 19:44:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:43.409890 543705 cpu.go:282] Add success.
I0320 19:44:43.409914 543705 memory.go:191] Add success.
I0320 19:44:43.419705 543705 net.go:648] Add success.
I0320 19:44:43.422302 543705 net.go:770] primary dev: ETH0
I0320 19:44:43.422315 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:44:43.422327 543705 net.go:698] Add success.
I0320 19:44:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:44:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:44:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:44:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:44:53.409777 543705 memory.go:184] no items to output this cycle
I0320 19:44:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:45:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:03.409785 543705 cpu.go:275] no items to output this cycle
I0320 19:45:03.409789 543705 memory.go:184] no items to output this cycle
E0320 19:45:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:13.409815 543705 memory.go:191] Add success.
I0320 19:45:13.409821 543705 cpu.go:282] Add success.
W0320 19:45:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:45:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:45:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:45:13.420138 543705 net.go:648] Add success.
I0320 19:45:13.423518 543705 net.go:770] primary dev: ETH0
I0320 19:45:13.423531 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:45:13.423543 543705 net.go:698] Add success.
I0320 19:45:13.463592 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"05d743c6-c312-4f23-9cdb-8aa741791dfd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:45:13.463627 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:45:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:45:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:45:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 19:45:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:45:14.456484 543705 disk_worker.go:494] system disk:vda1
I0320 19:45:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:45:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:45:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:45:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:45:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:45:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:45:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:23.409773 543705 memory.go:184] no items to output this cycle
I0320 19:45:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 19:45:25.817674 543705 disk_info.go:125] begin check local disk info of client
I0320 19:45:25.820117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:45:25.820123 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f4e40 0xc0003f4e80]
E0320 19:45:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:33.409793 543705 memory.go:184] no items to output this cycle
I0320 19:45:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 19:45:38.601714 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:45:38.601721 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:45:43.409874 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:43.410670 543705 memory.go:191] Add success.
I0320 19:45:43.409923 543705 cpu.go:282] Add success.
I0320 19:45:43.419732 543705 net.go:648] Add success.
I0320 19:45:43.422170 543705 net.go:770] primary dev: ETH0
I0320 19:45:43.422184 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:45:43.422197 543705 net.go:698] Add success.
I0320 19:45:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:45:46.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:45:46.458055 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:45:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:45:53.409788 543705 cpu.go:275] no items to output this cycle
I0320 19:45:53.409790 543705 memory.go:184] no items to output this cycle
E0320 19:46:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:03.409790 543705 memory.go:184] no items to output this cycle
I0320 19:46:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 19:46:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:13.409784 543705 cpu.go:282] Add success.
I0320 19:46:13.409787 543705 memory.go:191] Add success.
W0320 19:46:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:46:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:46:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:46:13.420111 543705 net.go:648] Add success.
I0320 19:46:13.422931 543705 net.go:770] primary dev: ETH0
I0320 19:46:13.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:46:13.422958 543705 net.go:698] Add success.
I0320 19:46:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:46:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:46:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 19:46:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:46:14.456582 543705 disk_worker.go:494] system disk:vda1
I0320 19:46:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:46:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:46:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:46:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:46:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:46:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:46:23.410248 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:23.410264 543705 memory.go:184] no items to output this cycle
I0320 19:46:23.410288 543705 cpu.go:275] no items to output this cycle
I0320 19:46:25.821676 543705 disk_info.go:125] begin check local disk info of client
I0320 19:46:25.824126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:46:25.824131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b28c0 0xc0003b2900]
E0320 19:46:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:33.409795 543705 memory.go:184] no items to output this cycle
I0320 19:46:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 19:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:43.409789 543705 memory.go:191] Add success.
I0320 19:46:43.409789 543705 cpu.go:282] Add success.
I0320 19:46:43.419754 543705 net.go:648] Add success.
I0320 19:46:43.422315 543705 net.go:770] primary dev: ETH0
I0320 19:46:43.422327 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:46:43.422338 543705 net.go:698] Add success.
I0320 19:46:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:46:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:46:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:46:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:46:53.409792 543705 memory.go:184] no items to output this cycle
I0320 19:46:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 19:47:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:03.409779 543705 memory.go:184] no items to output this cycle
I0320 19:47:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:47:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:13.409812 543705 memory.go:191] Add success.
I0320 19:47:13.409823 543705 cpu.go:282] Add success.
W0320 19:47:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:47:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:47:13.409863 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:47:13.420262 543705 net.go:648] Add success.
I0320 19:47:13.422989 543705 net.go:770] primary dev: ETH0
I0320 19:47:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:47:13.423018 543705 net.go:698] Add success.
I0320 19:47:13.453574 543705 event_worker.go:152] Polling the log file for events...
W0320 19:47:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:47:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 19:47:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:47:14.455854 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:47:14.455863 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:47:14.455868 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:47:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 19:47:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:47:15.456845 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:47:15.456854 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:47:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:47:16.457953 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:47:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:47:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:47:16.472337 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:47:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:23.409769 543705 memory.go:184] no items to output this cycle
I0320 19:47:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 19:47:25.825672 543705 disk_info.go:125] begin check local disk info of client
I0320 19:47:25.828098 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:47:25.828103 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004876c0 0xc000487700]
E0320 19:47:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:33.409795 543705 memory.go:184] no items to output this cycle
I0320 19:47:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 19:47:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:43.409776 543705 memory.go:191] Add success.
I0320 19:47:43.409800 543705 cpu.go:282] Add success.
I0320 19:47:43.419711 543705 net.go:648] Add success.
I0320 19:47:43.422743 543705 net.go:770] primary dev: ETH0
I0320 19:47:43.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:47:43.422773 543705 net.go:698] Add success.
I0320 19:47:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:47:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:47:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:47:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:47:53.409791 543705 memory.go:184] no items to output this cycle
I0320 19:47:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 19:48:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:03.409810 543705 memory.go:184] no items to output this cycle
I0320 19:48:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 19:48:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:13.409786 543705 cpu.go:282] Add success.
I0320 19:48:13.409800 543705 memory.go:191] Add success.
W0320 19:48:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:48:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:48:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:48:13.420063 543705 net.go:648] Add success.
I0320 19:48:13.422819 543705 net.go:770] primary dev: ETH0
I0320 19:48:13.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:48:13.422848 543705 net.go:698] Add success.
I0320 19:48:13.471825 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"17686cc3-2298-4cdb-adc4-ec7bf3e790c3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:48:13.471857 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:48:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:48:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:48:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 19:48:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:48:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 19:48:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:48:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:48:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:48:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:48:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:48:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:48:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 19:48:23.409782 543705 memory.go:184] no items to output this cycle
I0320 19:48:25.829671 543705 disk_info.go:125] begin check local disk info of client
I0320 19:48:25.832129 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:48:25.832134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b9080 0xc0003b90c0]
E0320 19:48:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:33.409776 543705 cpu.go:275] no items to output this cycle
I0320 19:48:33.409781 543705 memory.go:184] no items to output this cycle
I0320 19:48:38.602708 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:48:38.602714 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:48:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:43.409798 543705 cpu.go:282] Add success.
I0320 19:48:43.410752 543705 memory.go:191] Add success.
I0320 19:48:43.419707 543705 net.go:648] Add success.
I0320 19:48:43.422091 543705 net.go:770] primary dev: ETH0
I0320 19:48:43.422104 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:48:43.422115 543705 net.go:698] Add success.
I0320 19:48:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:48:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:48:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:48:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:48:53.409790 543705 memory.go:184] no items to output this cycle
I0320 19:48:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 19:49:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:03.409803 543705 memory.go:184] no items to output this cycle
I0320 19:49:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 19:49:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:13.409783 543705 memory.go:191] Add success.
I0320 19:49:13.409803 543705 cpu.go:282] Add success.
W0320 19:49:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:49:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:49:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:49:13.420339 543705 net.go:648] Add success.
I0320 19:49:13.423207 543705 net.go:770] primary dev: ETH0
I0320 19:49:13.423220 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:49:13.423232 543705 net.go:698] Add success.
I0320 19:49:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:49:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:49:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 19:49:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:49:14.456584 543705 disk_worker.go:494] system disk:vda1
I0320 19:49:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:49:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:49:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:49:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:49:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:49:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:23.409765 543705 memory.go:184] no items to output this cycle
I0320 19:49:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 19:49:25.833674 543705 disk_info.go:125] begin check local disk info of client
I0320 19:49:25.836082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:49:25.836089 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486e00 0xc000486e40]
E0320 19:49:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:33.409789 543705 memory.go:184] no items to output this cycle
I0320 19:49:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:49:43.409807 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:43.409839 543705 memory.go:191] Add success.
I0320 19:49:43.409849 543705 cpu.go:282] Add success.
I0320 19:49:43.420090 543705 net.go:770] primary dev: ETH0
I0320 19:49:43.420108 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:49:43.420122 543705 net.go:698] Add success.
I0320 19:49:43.420680 543705 net.go:648] Add success.
I0320 19:49:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:49:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:49:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:49:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:49:53.409794 543705 memory.go:184] no items to output this cycle
I0320 19:49:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 19:50:03.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:03.409823 543705 memory.go:184] no items to output this cycle
I0320 19:50:03.409834 543705 cpu.go:275] no items to output this cycle
E0320 19:50:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:13.409788 543705 memory.go:191] Add success.
I0320 19:50:13.409811 543705 cpu.go:282] Add success.
W0320 19:50:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:50:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:50:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:50:13.420261 543705 net.go:648] Add success.
I0320 19:50:13.423009 543705 net.go:770] primary dev: ETH0
I0320 19:50:13.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:50:13.423052 543705 net.go:698] Add success.
I0320 19:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:50:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:50:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 19:50:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:50:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 19:50:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:50:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:50:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:50:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:50:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:50:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:50:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:23.409790 543705 memory.go:184] no items to output this cycle
I0320 19:50:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 19:50:25.837669 543705 disk_info.go:125] begin check local disk info of client
I0320 19:50:25.840164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:50:25.840169 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486e40 0xc000486e80]
E0320 19:50:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:33.409792 543705 memory.go:184] no items to output this cycle
I0320 19:50:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 19:50:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:43.409792 543705 memory.go:191] Add success.
I0320 19:50:43.409819 543705 cpu.go:282] Add success.
I0320 19:50:43.419888 543705 net.go:648] Add success.
I0320 19:50:43.422690 543705 net.go:770] primary dev: ETH0
I0320 19:50:43.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:50:43.422719 543705 net.go:698] Add success.
I0320 19:50:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:50:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:50:46.458177 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:50:53.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:50:53.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:50:53.409818 543705 memory.go:184] no items to output this cycle
E0320 19:51:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:03.409780 543705 memory.go:184] no items to output this cycle
I0320 19:51:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 19:51:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:13.409794 543705 memory.go:191] Add success.
I0320 19:51:13.409795 543705 cpu.go:282] Add success.
W0320 19:51:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:51:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:51:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:51:13.420095 543705 net.go:648] Add success.
I0320 19:51:13.422985 543705 net.go:770] primary dev: ETH0
I0320 19:51:13.422999 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:51:13.423010 543705 net.go:698] Add success.
I0320 19:51:13.474951 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a58cbac5-4ab6-43e2-a4c6-f5ef333e89a2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:51:13.474992 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:51:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:51:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:51:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 19:51:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:51:14.456537 543705 disk_worker.go:494] system disk:vda1
I0320 19:51:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:51:15.455614 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:51:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:51:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:51:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:51:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:51:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:23.409797 543705 memory.go:184] no items to output this cycle
I0320 19:51:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 19:51:25.841672 543705 disk_info.go:125] begin check local disk info of client
I0320 19:51:25.844241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:51:25.844247 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9bc0 0xc0001f9c00]
E0320 19:51:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:33.409792 543705 memory.go:184] no items to output this cycle
I0320 19:51:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 19:51:38.603734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:51:38.603742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:51:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:43.410598 543705 memory.go:191] Add success.
I0320 19:51:43.409792 543705 cpu.go:282] Add success.
I0320 19:51:43.420412 543705 net.go:648] Add success.
I0320 19:51:43.423198 543705 net.go:770] primary dev: ETH0
I0320 19:51:43.423212 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:51:43.423224 543705 net.go:698] Add success.
I0320 19:51:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:51:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:51:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:51:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:51:53.409792 543705 memory.go:184] no items to output this cycle
I0320 19:51:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 19:52:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:03.409788 543705 memory.go:184] no items to output this cycle
I0320 19:52:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:52:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:13.409807 543705 memory.go:191] Add success.
I0320 19:52:13.409817 543705 cpu.go:282] Add success.
W0320 19:52:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:52:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:52:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:52:13.420149 543705 net.go:648] Add success.
I0320 19:52:13.422856 543705 net.go:770] primary dev: ETH0
I0320 19:52:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:52:13.422882 543705 net.go:698] Add success.
W0320 19:52:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:52:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 19:52:14.455171 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:52:14.456935 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:52:14.456944 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:52:14.456950 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:52:14.457025 543705 disk_worker.go:494] system disk:vda1
I0320 19:52:14.457069 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:52:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:52:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:52:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:52:16.457968 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:52:16.458012 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:52:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:52:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:52:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:23.409774 543705 memory.go:184] no items to output this cycle
I0320 19:52:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 19:52:25.845674 543705 disk_info.go:125] begin check local disk info of client
I0320 19:52:25.848121 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:52:25.848126 543705 disk_info.go:196] parse disk info done, disk is : [0xc000349bc0 0xc000349c00]
E0320 19:52:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:33.409775 543705 memory.go:184] no items to output this cycle
I0320 19:52:33.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:52:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:43.409786 543705 memory.go:191] Add success.
I0320 19:52:43.409788 543705 cpu.go:282] Add success.
I0320 19:52:43.419922 543705 net.go:648] Add success.
I0320 19:52:43.422811 543705 net.go:770] primary dev: ETH0
I0320 19:52:43.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:52:43.422846 543705 net.go:698] Add success.
I0320 19:52:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:52:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:52:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:52:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:52:53.409797 543705 memory.go:184] no items to output this cycle
I0320 19:52:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 19:53:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:03.409772 543705 memory.go:184] no items to output this cycle
I0320 19:53:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 19:53:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:13.409797 543705 memory.go:191] Add success.
I0320 19:53:13.409804 543705 cpu.go:282] Add success.
W0320 19:53:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:53:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:53:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:53:13.420296 543705 net.go:648] Add success.
I0320 19:53:13.423068 543705 net.go:770] primary dev: ETH0
I0320 19:53:13.423090 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:53:13.423103 543705 net.go:698] Add success.
I0320 19:53:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:53:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:53:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 19:53:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:53:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 19:53:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:53:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:53:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:53:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:53:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:53:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:53:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:23.409775 543705 memory.go:184] no items to output this cycle
I0320 19:53:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 19:53:25.849669 543705 disk_info.go:125] begin check local disk info of client
I0320 19:53:25.852116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:53:25.852122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a8700 0xc0002a8740]
E0320 19:53:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:33.409802 543705 memory.go:184] no items to output this cycle
I0320 19:53:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 19:53:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:43.409815 543705 memory.go:191] Add success.
I0320 19:53:43.409827 543705 cpu.go:282] Add success.
I0320 19:53:43.419877 543705 net.go:648] Add success.
I0320 19:53:43.423004 543705 net.go:770] primary dev: ETH0
I0320 19:53:43.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:53:43.423029 543705 net.go:698] Add success.
I0320 19:53:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:53:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:53:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:53:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:53:53.409814 543705 memory.go:184] no items to output this cycle
I0320 19:53:53.409831 543705 cpu.go:275] no items to output this cycle
E0320 19:54:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:03.409824 543705 memory.go:184] no items to output this cycle
I0320 19:54:03.409835 543705 cpu.go:275] no items to output this cycle
E0320 19:54:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:13.409802 543705 memory.go:191] Add success.
I0320 19:54:13.409806 543705 cpu.go:282] Add success.
W0320 19:54:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:54:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:54:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:54:13.420134 543705 net.go:648] Add success.
I0320 19:54:13.422930 543705 net.go:770] primary dev: ETH0
I0320 19:54:13.422945 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:54:13.422960 543705 net.go:698] Add success.
I0320 19:54:13.586777 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f6c0ec84-fadc-4e55-a9ac-1cf5201cefc7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 19:54:13.586812 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 19:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:54:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:54:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 19:54:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:54:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 19:54:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:54:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:54:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:54:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:54:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:54:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:54:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:23.409800 543705 memory.go:184] no items to output this cycle
I0320 19:54:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 19:54:25.853670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:54:25.856158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:54:25.856165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a6480 0xc0002a64c0]
E0320 19:54:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:33.409797 543705 memory.go:184] no items to output this cycle
I0320 19:54:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 19:54:38.604726 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:54:38.604733 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:54:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:43.410668 543705 memory.go:191] Add success.
I0320 19:54:43.409793 543705 cpu.go:282] Add success.
I0320 19:54:43.420361 543705 net.go:648] Add success.
I0320 19:54:43.423298 543705 net.go:770] primary dev: ETH0
I0320 19:54:43.423311 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:54:43.423325 543705 net.go:698] Add success.
I0320 19:54:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:54:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:54:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:54:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:54:53.409800 543705 memory.go:184] no items to output this cycle
I0320 19:54:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 19:55:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:03.409787 543705 memory.go:184] no items to output this cycle
I0320 19:55:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 19:55:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:13.409827 543705 memory.go:191] Add success.
I0320 19:55:13.409837 543705 cpu.go:282] Add success.
W0320 19:55:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:55:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:55:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:55:13.420211 543705 net.go:648] Add success.
I0320 19:55:13.423399 543705 net.go:770] primary dev: ETH0
I0320 19:55:13.423413 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:55:13.423425 543705 net.go:698] Add success.
I0320 19:55:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:55:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:55:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 19:55:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:55:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 19:55:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:55:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:55:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:55:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:55:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:55:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:55:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:23.409771 543705 memory.go:184] no items to output this cycle
I0320 19:55:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 19:55:25.857674 543705 disk_info.go:125] begin check local disk info of client
I0320 19:55:25.860111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:55:25.860116 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb7c0 0xc0001fb800]
E0320 19:55:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:33.409781 543705 memory.go:184] no items to output this cycle
I0320 19:55:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 19:55:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:43.409776 543705 memory.go:191] Add success.
I0320 19:55:43.409809 543705 cpu.go:282] Add success.
I0320 19:55:43.419843 543705 net.go:648] Add success.
I0320 19:55:43.422977 543705 net.go:770] primary dev: ETH0
I0320 19:55:43.422990 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:55:43.423002 543705 net.go:698] Add success.
I0320 19:55:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:55:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:55:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:55:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:55:53.409793 543705 memory.go:184] no items to output this cycle
I0320 19:55:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 19:56:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:03.409788 543705 memory.go:184] no items to output this cycle
I0320 19:56:03.409835 543705 cpu.go:275] no items to output this cycle
E0320 19:56:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:13.409817 543705 memory.go:191] Add success.
I0320 19:56:13.409828 543705 cpu.go:282] Add success.
W0320 19:56:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:56:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:56:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:56:13.420165 543705 net.go:648] Add success.
I0320 19:56:13.422739 543705 net.go:770] primary dev: ETH0
I0320 19:56:13.422752 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:56:13.422764 543705 net.go:698] Add success.
I0320 19:56:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:56:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:56:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 19:56:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:56:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 19:56:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:56:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:56:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:56:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:56:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:56:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:56:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:23.409764 543705 memory.go:184] no items to output this cycle
I0320 19:56:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 19:56:25.861673 543705 disk_info.go:125] begin check local disk info of client
I0320 19:56:25.864115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:56:25.864121 543705 disk_info.go:196] parse disk info done, disk is : [0xc00036a380 0xc00036a400]
E0320 19:56:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:33.409763 543705 memory.go:184] no items to output this cycle
I0320 19:56:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 19:56:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:43.409791 543705 memory.go:191] Add success.
I0320 19:56:43.409794 543705 cpu.go:282] Add success.
I0320 19:56:43.420068 543705 net.go:648] Add success.
I0320 19:56:43.422837 543705 net.go:770] primary dev: ETH0
I0320 19:56:43.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:56:43.422862 543705 net.go:698] Add success.
I0320 19:56:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:56:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:56:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:56:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:56:53.409791 543705 memory.go:184] no items to output this cycle
I0320 19:56:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 19:57:03.409855 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:03.409867 543705 cpu.go:275] no items to output this cycle
I0320 19:57:03.409870 543705 memory.go:184] no items to output this cycle
E0320 19:57:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:13.409789 543705 memory.go:191] Add success.
I0320 19:57:13.409810 543705 cpu.go:282] Add success.
W0320 19:57:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:57:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:57:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:57:13.420166 543705 net.go:648] Add success.
I0320 19:57:13.429093 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 19:57:13.429171 543705 net.go:770] primary dev: ETH0
I0320 19:57:13.429184 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:57:13.429196 543705 net.go:698] Add success.
I0320 19:57:13.453714 543705 event_worker.go:152] Polling the log file for events...
W0320 19:57:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:57:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 19:57:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 19:57:14.455898 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 19:57:14.455907 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 19:57:14.455913 543705 custom_config.go:64] query custom config with name: gpu
I0320 19:57:14.456547 543705 disk_worker.go:494] system disk:vda1
I0320 19:57:14.456577 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 19:57:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 19:57:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:57:16.457944 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 19:57:16.457944 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 19:57:16.457998 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:57:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:57:16.472330 543705 disk_local_worker.go:436] Get disk info: []
W0320 19:57:18.453958 543705 custom_config.go:80] failed to get custom config
I0320 19:57:18.453977 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
E0320 19:57:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:23.409763 543705 memory.go:184] no items to output this cycle
I0320 19:57:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 19:57:25.865670 543705 disk_info.go:125] begin check local disk info of client
I0320 19:57:25.868085 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:57:25.868091 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492d80 0xc000492dc0]
E0320 19:57:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:33.409765 543705 memory.go:184] no items to output this cycle
I0320 19:57:33.409795 543705 cpu.go:275] no items to output this cycle
I0320 19:57:38.604876 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 19:57:38.604883 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 19:57:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:43.410700 543705 memory.go:191] Add success.
I0320 19:57:43.409797 543705 cpu.go:282] Add success.
I0320 19:57:43.420380 543705 net.go:648] Add success.
I0320 19:57:43.423472 543705 net.go:770] primary dev: ETH0
I0320 19:57:43.423484 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:57:43.423497 543705 net.go:698] Add success.
I0320 19:57:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:57:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:57:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:57:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:57:53.409789 543705 memory.go:184] no items to output this cycle
I0320 19:57:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 19:58:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:03.409770 543705 memory.go:184] no items to output this cycle
I0320 19:58:03.409840 543705 cpu.go:275] no items to output this cycle
E0320 19:58:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:13.409812 543705 memory.go:191] Add success.
I0320 19:58:13.409813 543705 cpu.go:282] Add success.
W0320 19:58:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 19:58:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:58:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:58:13.420542 543705 net.go:648] Add success.
I0320 19:58:13.423445 543705 net.go:770] primary dev: ETH0
I0320 19:58:13.423460 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:58:13.423474 543705 net.go:698] Add success.
I0320 19:58:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:58:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:58:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 19:58:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:58:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 19:58:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:58:16.458034 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:58:16.458091 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:58:16.458112 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:58:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:58:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:23.409775 543705 memory.go:184] no items to output this cycle
I0320 19:58:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 19:58:25.869672 543705 disk_info.go:125] begin check local disk info of client
I0320 19:58:25.872125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:58:25.872131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 19:58:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:33.409780 543705 memory.go:184] no items to output this cycle
I0320 19:58:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 19:58:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:43.409797 543705 memory.go:191] Add success.
I0320 19:58:43.409803 543705 cpu.go:282] Add success.
I0320 19:58:43.419841 543705 net.go:648] Add success.
I0320 19:58:43.422660 543705 net.go:770] primary dev: ETH0
I0320 19:58:43.422673 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:58:43.422685 543705 net.go:698] Add success.
I0320 19:58:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:58:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:58:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:58:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:58:53.409803 543705 memory.go:184] no items to output this cycle
I0320 19:58:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 19:59:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:03.409806 543705 memory.go:184] no items to output this cycle
I0320 19:59:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 19:59:13.409905 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:13.409933 543705 memory.go:191] Add success.
W0320 19:59:13.409971 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 19:59:13.409982 543705 cpu.go:282] Add success.
W0320 19:59:13.409991 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 19:59:13.410000 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 19:59:13.419726 543705 net.go:648] Add success.
I0320 19:59:13.422729 543705 net.go:770] primary dev: ETH0
I0320 19:59:13.422748 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:59:13.422765 543705 net.go:698] Add success.
I0320 19:59:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 19:59:14.455096 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 19:59:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 19:59:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 19:59:14.456497 543705 disk_worker.go:494] system disk:vda1
I0320 19:59:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 19:59:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 19:59:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:59:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:59:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 19:59:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 19:59:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:23.409784 543705 memory.go:184] no items to output this cycle
I0320 19:59:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 19:59:25.873673 543705 disk_info.go:125] begin check local disk info of client
I0320 19:59:25.876124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 19:59:25.876130 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aab40 0xc0001aab80]
E0320 19:59:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:33.409786 543705 memory.go:184] no items to output this cycle
I0320 19:59:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 19:59:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:43.409809 543705 cpu.go:282] Add success.
I0320 19:59:43.409811 543705 memory.go:191] Add success.
I0320 19:59:43.419921 543705 net.go:648] Add success.
I0320 19:59:43.422556 543705 net.go:770] primary dev: ETH0
I0320 19:59:43.422568 543705 net.go:802] Send network stats successfully!,count is 6
I0320 19:59:43.422585 543705 net.go:698] Add success.
I0320 19:59:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 19:59:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 19:59:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 19:59:53.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 19:59:53.409816 543705 cpu.go:275] no items to output this cycle
I0320 19:59:53.409822 543705 memory.go:184] no items to output this cycle
E0320 20:00:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:03.409807 543705 memory.go:184] no items to output this cycle
I0320 20:00:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 20:00:13.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:13.409838 543705 memory.go:191] Add success.
I0320 20:00:13.409838 543705 cpu.go:282] Add success.
W0320 20:00:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:00:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:00:13.409882 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:00:13.420511 543705 net.go:648] Add success.
I0320 20:00:13.423050 543705 net.go:770] primary dev: ETH0
I0320 20:00:13.423064 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:00:13.423075 543705 net.go:698] Add success.
I0320 20:00:13.469290 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dbc86c47-dfe2-4e1e-b5c5-cf469daafc5f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:00:13.469321 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:00:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:00:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:00:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 20:00:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:00:14.456636 543705 disk_worker.go:494] system disk:vda1
I0320 20:00:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:00:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:00:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:00:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:00:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:00:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:00:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:23.409796 543705 memory.go:184] no items to output this cycle
I0320 20:00:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 20:00:25.877670 543705 disk_info.go:125] begin check local disk info of client
I0320 20:00:25.880235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:00:25.880241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b1180 0xc0004b11c0]
E0320 20:00:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:33.409797 543705 memory.go:184] no items to output this cycle
I0320 20:00:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 20:00:38.605732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:00:38.605738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:00:43.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:43.410644 543705 memory.go:191] Add success.
I0320 20:00:43.409807 543705 cpu.go:282] Add success.
I0320 20:00:43.420323 543705 net.go:648] Add success.
I0320 20:00:43.422798 543705 net.go:770] primary dev: ETH0
I0320 20:00:43.422811 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:00:43.422823 543705 net.go:698] Add success.
I0320 20:00:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:00:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:00:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:00:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:00:53.409782 543705 memory.go:184] no items to output this cycle
I0320 20:00:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 20:01:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:03.409776 543705 memory.go:184] no items to output this cycle
I0320 20:01:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 20:01:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:13.409784 543705 memory.go:191] Add success.
I0320 20:01:13.409809 543705 cpu.go:282] Add success.
W0320 20:01:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:01:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:01:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:01:13.420124 543705 net.go:648] Add success.
I0320 20:01:13.423003 543705 net.go:770] primary dev: ETH0
I0320 20:01:13.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:01:13.423033 543705 net.go:698] Add success.
I0320 20:01:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:01:14.455348 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:01:14.455512 543705 disk_worker.go:708] disk space is not compliant
W0320 20:01:14.455516 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:01:14.457510 543705 disk_worker.go:494] system disk:vda1
I0320 20:01:14.457548 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:01:15.456018 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:01:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:01:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:01:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:01:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:01:23.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:23.409765 543705 memory.go:184] no items to output this cycle
I0320 20:01:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 20:01:25.881675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:01:25.884117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:01:25.884123 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b12c0 0xc0004b1300]
E0320 20:01:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 20:01:33.409786 543705 memory.go:184] no items to output this cycle
E0320 20:01:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:43.409788 543705 memory.go:191] Add success.
I0320 20:01:43.409804 543705 cpu.go:282] Add success.
I0320 20:01:43.419857 543705 net.go:648] Add success.
I0320 20:01:43.422647 543705 net.go:770] primary dev: ETH0
I0320 20:01:43.422659 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:01:43.422670 543705 net.go:698] Add success.
I0320 20:01:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:01:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:01:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:01:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:01:53.409782 543705 memory.go:184] no items to output this cycle
I0320 20:01:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 20:02:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:03.409804 543705 memory.go:184] no items to output this cycle
I0320 20:02:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 20:02:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:13.409797 543705 memory.go:191] Add success.
I0320 20:02:13.409796 543705 cpu.go:282] Add success.
W0320 20:02:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:02:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:02:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:02:13.420673 543705 net.go:648] Add success.
I0320 20:02:13.423809 543705 net.go:770] primary dev: ETH0
I0320 20:02:13.423822 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:02:13.423833 543705 net.go:698] Add success.
W0320 20:02:14.455286 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:02:14.455421 543705 disk_worker.go:708] disk space is not compliant
W0320 20:02:14.455426 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:02:14.456286 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:02:14.456296 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:02:14.456303 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:02:14.457268 543705 disk_worker.go:494] system disk:vda1
I0320 20:02:14.457297 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:02:15.456796 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:02:15.456804 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:02:16.457919 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:02:16.457919 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:02:16.457973 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:02:16.457992 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:02:16.472305 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:02:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:23.409770 543705 memory.go:184] no items to output this cycle
I0320 20:02:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 20:02:25.885674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:02:25.888109 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:02:25.888116 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034de80 0xc00034dec0]
E0320 20:02:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:33.409785 543705 memory.go:184] no items to output this cycle
I0320 20:02:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 20:02:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:43.409779 543705 memory.go:191] Add success.
I0320 20:02:43.409815 543705 cpu.go:282] Add success.
I0320 20:02:43.419870 543705 net.go:648] Add success.
I0320 20:02:43.422702 543705 net.go:770] primary dev: ETH0
I0320 20:02:43.422714 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:02:43.422743 543705 net.go:698] Add success.
I0320 20:02:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:02:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:02:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:02:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:02:53.409794 543705 memory.go:184] no items to output this cycle
I0320 20:02:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 20:03:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:03.409781 543705 memory.go:184] no items to output this cycle
I0320 20:03:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 20:03:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:13.409785 543705 memory.go:191] Add success.
I0320 20:03:13.409803 543705 cpu.go:282] Add success.
W0320 20:03:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:03:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:03:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:03:13.420054 543705 net.go:648] Add success.
I0320 20:03:13.422778 543705 net.go:770] primary dev: ETH0
I0320 20:03:13.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:03:13.422808 543705 net.go:698] Add success.
I0320 20:03:13.463707 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c5fb7e51-612a-4744-aa6f-f51dc1d5b7f5","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:03:13.463738 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:03:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:03:14.455569 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:03:14.455583 543705 disk_worker.go:708] disk space is not compliant
W0320 20:03:14.455587 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:03:14.457159 543705 disk_worker.go:494] system disk:vda1
I0320 20:03:14.457210 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:03:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:03:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:03:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:03:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:03:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:03:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:23.409800 543705 memory.go:184] no items to output this cycle
I0320 20:03:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 20:03:25.889673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:03:25.892138 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:03:25.892144 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004901c0 0xc000490200]
E0320 20:03:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:33.409766 543705 memory.go:184] no items to output this cycle
I0320 20:03:33.409802 543705 cpu.go:275] no items to output this cycle
I0320 20:03:38.606736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:03:38.606743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:03:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:43.410554 543705 memory.go:191] Add success.
I0320 20:03:43.409855 543705 cpu.go:282] Add success.
I0320 20:03:43.420291 543705 net.go:648] Add success.
I0320 20:03:43.422788 543705 net.go:770] primary dev: ETH0
I0320 20:03:43.422803 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:03:43.422817 543705 net.go:698] Add success.
I0320 20:03:46.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:03:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:03:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:03:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:03:53.409798 543705 memory.go:184] no items to output this cycle
I0320 20:03:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 20:04:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:03.409776 543705 memory.go:184] no items to output this cycle
I0320 20:04:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:04:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:13.409793 543705 memory.go:191] Add success.
I0320 20:04:13.409808 543705 cpu.go:282] Add success.
W0320 20:04:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:04:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:04:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:04:13.420073 543705 net.go:648] Add success.
I0320 20:04:13.422943 543705 net.go:770] primary dev: ETH0
I0320 20:04:13.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:04:13.422971 543705 net.go:698] Add success.
I0320 20:04:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:04:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:04:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 20:04:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:04:14.456564 543705 disk_worker.go:494] system disk:vda1
I0320 20:04:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:04:15.456006 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:04:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:04:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:04:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:04:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:04:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:23.409795 543705 memory.go:184] no items to output this cycle
I0320 20:04:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 20:04:25.893672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:04:25.896118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:04:25.896124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fba80 0xc0001fbac0]
E0320 20:04:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 20:04:33.409780 543705 memory.go:184] no items to output this cycle
E0320 20:04:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:43.409791 543705 memory.go:191] Add success.
I0320 20:04:43.409813 543705 cpu.go:282] Add success.
I0320 20:04:43.420032 543705 net.go:648] Add success.
I0320 20:04:43.422758 543705 net.go:770] primary dev: ETH0
I0320 20:04:43.422771 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:04:43.422785 543705 net.go:698] Add success.
I0320 20:04:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:04:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:04:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:04:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:04:53.409812 543705 memory.go:184] no items to output this cycle
I0320 20:04:53.409823 543705 cpu.go:275] no items to output this cycle
E0320 20:05:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:03.409815 543705 memory.go:184] no items to output this cycle
I0320 20:05:03.409829 543705 cpu.go:275] no items to output this cycle
E0320 20:05:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:13.409787 543705 memory.go:191] Add success.
I0320 20:05:13.409805 543705 cpu.go:282] Add success.
W0320 20:05:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:05:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:05:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:05:13.420190 543705 net.go:648] Add success.
I0320 20:05:13.423478 543705 net.go:770] primary dev: ETH0
I0320 20:05:13.423492 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:05:13.423505 543705 net.go:698] Add success.
I0320 20:05:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:05:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:05:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 20:05:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:05:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 20:05:14.456619 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:05:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:05:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:05:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:05:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:05:16.472893 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:05:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:23.409767 543705 memory.go:184] no items to output this cycle
I0320 20:05:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 20:05:25.897674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:05:25.900123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:05:25.900129 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb540 0xc0001fb580]
E0320 20:05:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:33.409765 543705 memory.go:184] no items to output this cycle
I0320 20:05:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 20:05:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:43.409786 543705 memory.go:191] Add success.
I0320 20:05:43.409802 543705 cpu.go:282] Add success.
I0320 20:05:43.419875 543705 net.go:648] Add success.
I0320 20:05:43.422643 543705 net.go:770] primary dev: ETH0
I0320 20:05:43.422656 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:05:43.422668 543705 net.go:698] Add success.
I0320 20:05:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:05:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:05:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:05:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:05:53.409784 543705 memory.go:184] no items to output this cycle
I0320 20:05:53.409846 543705 cpu.go:275] no items to output this cycle
E0320 20:06:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:03.409806 543705 memory.go:184] no items to output this cycle
I0320 20:06:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:06:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:13.409824 543705 memory.go:191] Add success.
I0320 20:06:13.409828 543705 cpu.go:282] Add success.
W0320 20:06:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:06:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:06:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:06:13.420145 543705 net.go:648] Add success.
I0320 20:06:13.422738 543705 net.go:770] primary dev: ETH0
I0320 20:06:13.422753 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:06:13.422766 543705 net.go:698] Add success.
I0320 20:06:13.643144 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d47868a0-06ed-4b49-ab7d-40f4c3356b3b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:06:13.643178 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:06:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:06:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:06:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 20:06:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:06:14.456775 543705 disk_worker.go:494] system disk:vda1
I0320 20:06:14.456817 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:06:15.455602 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:06:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:06:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:06:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:06:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:06:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:23.409781 543705 memory.go:184] no items to output this cycle
I0320 20:06:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 20:06:25.901673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:06:25.904131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:06:25.904137 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4580 0xc0000c45c0]
E0320 20:06:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:33.409781 543705 memory.go:184] no items to output this cycle
I0320 20:06:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 20:06:38.607741 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:06:38.607748 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:06:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:43.409790 543705 memory.go:191] Add success.
I0320 20:06:43.409796 543705 cpu.go:282] Add success.
I0320 20:06:43.420004 543705 net.go:648] Add success.
I0320 20:06:43.420975 543705 net.go:770] primary dev: ETH0
I0320 20:06:43.420991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:06:43.421005 543705 net.go:698] Add success.
I0320 20:06:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:06:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:06:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:06:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:06:53.409818 543705 memory.go:184] no items to output this cycle
I0320 20:06:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:07:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:03.409788 543705 memory.go:184] no items to output this cycle
I0320 20:07:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 20:07:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:13.409815 543705 memory.go:191] Add success.
I0320 20:07:13.409820 543705 cpu.go:282] Add success.
W0320 20:07:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:07:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:07:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:07:13.420147 543705 net.go:648] Add success.
I0320 20:07:13.422818 543705 net.go:770] primary dev: ETH0
I0320 20:07:13.422832 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:07:13.422845 543705 net.go:698] Add success.
I0320 20:07:13.453409 543705 event_worker.go:152] Polling the log file for events...
W0320 20:07:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:07:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 20:07:14.455189 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:07:14.455903 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:07:14.455912 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:07:14.455918 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:07:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 20:07:14.456598 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:07:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:07:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:07:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:07:16.457973 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:07:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:07:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:07:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:07:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:23.409776 543705 memory.go:184] no items to output this cycle
I0320 20:07:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 20:07:25.905675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:07:25.908093 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:07:25.908099 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2800 0xc0003b2840]
E0320 20:07:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:33.409773 543705 memory.go:184] no items to output this cycle
I0320 20:07:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 20:07:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:43.409786 543705 memory.go:191] Add success.
I0320 20:07:43.409804 543705 cpu.go:282] Add success.
I0320 20:07:43.419959 543705 net.go:648] Add success.
I0320 20:07:43.422845 543705 net.go:770] primary dev: ETH0
I0320 20:07:43.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:07:43.422870 543705 net.go:698] Add success.
I0320 20:07:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:07:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:07:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:07:53.409805 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:07:53.409825 543705 memory.go:184] no items to output this cycle
I0320 20:07:53.409826 543705 cpu.go:275] no items to output this cycle
E0320 20:08:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:03.409805 543705 memory.go:184] no items to output this cycle
I0320 20:08:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:08:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:13.409786 543705 memory.go:191] Add success.
I0320 20:08:13.409805 543705 cpu.go:282] Add success.
W0320 20:08:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:08:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:08:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:08:13.420199 543705 net.go:648] Add success.
I0320 20:08:13.422918 543705 net.go:770] primary dev: ETH0
I0320 20:08:13.422931 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:08:13.422944 543705 net.go:698] Add success.
I0320 20:08:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:08:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:08:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 20:08:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:08:14.456554 543705 disk_worker.go:494] system disk:vda1
I0320 20:08:14.456586 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:08:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:08:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:08:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:08:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:08:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:08:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:23.409801 543705 memory.go:184] no items to output this cycle
I0320 20:08:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 20:08:25.909665 543705 disk_info.go:125] begin check local disk info of client
I0320 20:08:25.912141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:08:25.912147 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004df780 0xc0004df7c0]
E0320 20:08:33.409901 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:33.409919 543705 memory.go:184] no items to output this cycle
I0320 20:08:33.409925 543705 cpu.go:275] no items to output this cycle
E0320 20:08:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:43.409825 543705 memory.go:191] Add success.
I0320 20:08:43.409829 543705 cpu.go:282] Add success.
I0320 20:08:43.420045 543705 net.go:648] Add success.
I0320 20:08:43.422583 543705 net.go:770] primary dev: ETH0
I0320 20:08:43.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:08:43.422612 543705 net.go:698] Add success.
I0320 20:08:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:08:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:08:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:08:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:08:53.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:08:53.409883 543705 memory.go:184] no items to output this cycle
E0320 20:09:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:03.409773 543705 memory.go:184] no items to output this cycle
I0320 20:09:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 20:09:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:13.409825 543705 memory.go:191] Add success.
I0320 20:09:13.409826 543705 cpu.go:282] Add success.
W0320 20:09:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:09:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:09:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:09:13.420141 543705 net.go:648] Add success.
I0320 20:09:13.422668 543705 net.go:770] primary dev: ETH0
I0320 20:09:13.422682 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:09:13.422695 543705 net.go:698] Add success.
I0320 20:09:13.468745 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"90169a8b-86b7-49c0-a7bf-555913c58040","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:09:13.468779 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:09:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:09:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0320 20:09:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:09:14.456484 543705 disk_worker.go:494] system disk:vda1
I0320 20:09:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:09:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:09:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:09:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:09:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:09:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:09:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:23.409767 543705 memory.go:184] no items to output this cycle
I0320 20:09:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 20:09:25.913676 543705 disk_info.go:125] begin check local disk info of client
I0320 20:09:25.916104 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:09:25.916109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b2400 0xc0003b2440]
E0320 20:09:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:33.409769 543705 memory.go:184] no items to output this cycle
I0320 20:09:33.409900 543705 cpu.go:275] no items to output this cycle
I0320 20:09:38.608752 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:09:38.608760 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:09:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:43.410698 543705 memory.go:191] Add success.
I0320 20:09:43.409810 543705 cpu.go:282] Add success.
I0320 20:09:43.420405 543705 net.go:648] Add success.
I0320 20:09:43.423055 543705 net.go:770] primary dev: ETH0
I0320 20:09:43.423068 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:09:43.423081 543705 net.go:698] Add success.
I0320 20:09:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:09:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:09:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:09:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 20:09:53.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:09:53.409830 543705 memory.go:184] no items to output this cycle
E0320 20:10:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:03.409786 543705 memory.go:184] no items to output this cycle
I0320 20:10:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 20:10:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:13.409798 543705 memory.go:191] Add success.
I0320 20:10:13.409816 543705 cpu.go:282] Add success.
W0320 20:10:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:10:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:10:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:10:13.420328 543705 net.go:648] Add success.
I0320 20:10:13.423081 543705 net.go:770] primary dev: ETH0
I0320 20:10:13.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:10:13.423105 543705 net.go:698] Add success.
I0320 20:10:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:10:14.455149 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:10:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0320 20:10:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:10:14.456482 543705 disk_worker.go:494] system disk:vda1
I0320 20:10:14.456525 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:10:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:10:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:10:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:10:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:10:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:10:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:23.409784 543705 memory.go:184] no items to output this cycle
I0320 20:10:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 20:10:25.917674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:10:25.920107 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:10:25.920112 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003de900 0xc0003de940]
E0320 20:10:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:33.409797 543705 memory.go:184] no items to output this cycle
I0320 20:10:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:10:43.409922 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:43.410022 543705 cpu.go:282] Add success.
I0320 20:10:43.410049 543705 memory.go:191] Add success.
I0320 20:10:43.419707 543705 net.go:648] Add success.
I0320 20:10:43.422452 543705 net.go:770] primary dev: ETH0
I0320 20:10:43.422466 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:10:43.422477 543705 net.go:698] Add success.
I0320 20:10:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:10:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:10:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:10:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:10:53.409795 543705 memory.go:184] no items to output this cycle
I0320 20:10:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:11:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:03.409777 543705 memory.go:184] no items to output this cycle
I0320 20:11:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 20:11:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:13.409835 543705 memory.go:191] Add success.
I0320 20:11:13.409836 543705 cpu.go:282] Add success.
W0320 20:11:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:11:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:11:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:11:13.420643 543705 net.go:648] Add success.
I0320 20:11:13.423475 543705 net.go:770] primary dev: ETH0
I0320 20:11:13.423489 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:11:13.423504 543705 net.go:698] Add success.
I0320 20:11:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:11:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:11:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 20:11:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:11:14.456473 543705 disk_worker.go:494] system disk:vda1
I0320 20:11:14.456519 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:11:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:11:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:11:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:11:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:11:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:11:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:23.409777 543705 memory.go:184] no items to output this cycle
I0320 20:11:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 20:11:25.921673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:11:25.924135 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:11:25.924142 543705 disk_info.go:196] parse disk info done, disk is : [0xc00033fa40 0xc00033fa80]
E0320 20:11:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:33.409809 543705 memory.go:184] no items to output this cycle
I0320 20:11:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:11:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:43.409787 543705 memory.go:191] Add success.
I0320 20:11:43.409805 543705 cpu.go:282] Add success.
I0320 20:11:43.419751 543705 net.go:648] Add success.
I0320 20:11:43.422549 543705 net.go:770] primary dev: ETH0
I0320 20:11:43.422562 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:11:43.422573 543705 net.go:698] Add success.
I0320 20:11:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:11:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:11:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:11:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:11:53.409778 543705 memory.go:184] no items to output this cycle
I0320 20:11:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 20:12:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:03.409777 543705 memory.go:184] no items to output this cycle
I0320 20:12:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 20:12:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:13.409813 543705 memory.go:191] Add success.
I0320 20:12:13.409820 543705 cpu.go:282] Add success.
W0320 20:12:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:12:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:12:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:12:13.420155 543705 net.go:648] Add success.
I0320 20:12:13.422658 543705 net.go:770] primary dev: ETH0
I0320 20:12:13.422671 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:12:13.422683 543705 net.go:698] Add success.
I0320 20:12:14.018887 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ea5e1733-4fa7-4679-a16c-671af76ecc2a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:12:14.018929 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 20:12:14.454897 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:12:14.454924 543705 disk_worker.go:708] disk space is not compliant
W0320 20:12:14.454929 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:12:14.455662 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:12:14.455671 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:12:14.455676 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:12:14.456475 543705 disk_worker.go:494] system disk:vda1
I0320 20:12:14.456506 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:12:15.456775 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:12:15.456785 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:12:16.457898 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:12:16.457897 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:12:16.457960 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:12:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:12:16.472325 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:12:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:23.409778 543705 memory.go:184] no items to output this cycle
I0320 20:12:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 20:12:25.925688 543705 disk_info.go:125] begin check local disk info of client
I0320 20:12:25.928119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:12:25.928124 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047d600 0xc00047d640]
E0320 20:12:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:33.409798 543705 memory.go:184] no items to output this cycle
I0320 20:12:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 20:12:38.609736 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:12:38.609743 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:12:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:43.410690 543705 memory.go:191] Add success.
I0320 20:12:43.409807 543705 cpu.go:282] Add success.
I0320 20:12:43.420818 543705 net.go:648] Add success.
I0320 20:12:43.424148 543705 net.go:770] primary dev: ETH0
I0320 20:12:43.424163 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:12:43.424176 543705 net.go:698] Add success.
I0320 20:12:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:12:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:12:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:12:53.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:12:53.409762 543705 memory.go:184] no items to output this cycle
I0320 20:12:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 20:13:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:03.409805 543705 memory.go:184] no items to output this cycle
I0320 20:13:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 20:13:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:13.409783 543705 memory.go:191] Add success.
W0320 20:13:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 20:13:13.409808 543705 cpu.go:282] Add success.
W0320 20:13:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:13:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:13:13.420108 543705 net.go:648] Add success.
I0320 20:13:13.422870 543705 net.go:770] primary dev: ETH0
I0320 20:13:13.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:13:13.422895 543705 net.go:698] Add success.
I0320 20:13:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:13:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:13:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0320 20:13:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:13:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 20:13:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:13:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:13:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:13:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:13:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:13:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:13:23.410373 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:23.410386 543705 memory.go:184] no items to output this cycle
I0320 20:13:23.410388 543705 cpu.go:275] no items to output this cycle
I0320 20:13:25.929673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:13:25.932112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:13:25.932118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8c40 0xc0003c8c80]
E0320 20:13:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:33.409774 543705 memory.go:184] no items to output this cycle
I0320 20:13:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 20:13:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:43.409927 543705 memory.go:191] Add success.
I0320 20:13:43.409936 543705 cpu.go:282] Add success.
I0320 20:13:43.419773 543705 net.go:648] Add success.
I0320 20:13:43.422515 543705 net.go:770] primary dev: ETH0
I0320 20:13:43.422530 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:13:43.422543 543705 net.go:698] Add success.
I0320 20:13:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:13:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:13:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:13:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:13:53.409775 543705 memory.go:184] no items to output this cycle
I0320 20:13:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 20:14:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:03.409806 543705 memory.go:184] no items to output this cycle
I0320 20:14:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 20:14:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:13.409805 543705 memory.go:191] Add success.
I0320 20:14:13.409805 543705 cpu.go:282] Add success.
W0320 20:14:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:14:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:14:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:14:13.420140 543705 net.go:648] Add success.
I0320 20:14:13.422782 543705 net.go:770] primary dev: ETH0
I0320 20:14:13.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:14:13.422808 543705 net.go:698] Add success.
I0320 20:14:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:14:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:14:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 20:14:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:14:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 20:14:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:14:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:14:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:14:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:14:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:14:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:14:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:23.409780 543705 memory.go:184] no items to output this cycle
I0320 20:14:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 20:14:25.933670 543705 disk_info.go:125] begin check local disk info of client
I0320 20:14:25.936206 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:14:25.936212 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a2740 0xc0004a2780]
E0320 20:14:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:33.409790 543705 memory.go:184] no items to output this cycle
I0320 20:14:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:14:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:43.409797 543705 memory.go:191] Add success.
I0320 20:14:43.409810 543705 cpu.go:282] Add success.
I0320 20:14:43.420339 543705 net.go:648] Add success.
I0320 20:14:43.423091 543705 net.go:770] primary dev: ETH0
I0320 20:14:43.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:14:43.423115 543705 net.go:698] Add success.
I0320 20:14:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:14:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:14:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:14:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:14:53.409780 543705 cpu.go:275] no items to output this cycle
I0320 20:14:53.409786 543705 memory.go:184] no items to output this cycle
E0320 20:15:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:03.409791 543705 cpu.go:275] no items to output this cycle
I0320 20:15:03.409795 543705 memory.go:184] no items to output this cycle
E0320 20:15:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:13.409798 543705 memory.go:191] Add success.
I0320 20:15:13.409799 543705 cpu.go:282] Add success.
W0320 20:15:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:15:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:15:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:15:13.420051 543705 net.go:648] Add success.
I0320 20:15:13.422806 543705 net.go:770] primary dev: ETH0
I0320 20:15:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:15:13.422831 543705 net.go:698] Add success.
I0320 20:15:13.468807 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"adfdd83d-fcfb-477e-9e3b-ba1c8ae5024c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:15:13.468842 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:15:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:15:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:15:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 20:15:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:15:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 20:15:14.456558 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:15:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:15:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:15:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:15:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:15:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:15:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:23.409766 543705 memory.go:184] no items to output this cycle
I0320 20:15:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 20:15:25.937674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:15:25.940127 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:15:25.940134 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a640 0xc00039a680]
E0320 20:15:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:33.409771 543705 memory.go:184] no items to output this cycle
I0320 20:15:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 20:15:38.610759 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:15:38.610765 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:15:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:43.410599 543705 memory.go:191] Add success.
I0320 20:15:43.409803 543705 cpu.go:282] Add success.
I0320 20:15:43.420341 543705 net.go:648] Add success.
I0320 20:15:43.423215 543705 net.go:770] primary dev: ETH0
I0320 20:15:43.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:15:43.423245 543705 net.go:698] Add success.
I0320 20:15:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:15:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:15:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:15:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:15:53.409801 543705 memory.go:184] no items to output this cycle
I0320 20:15:53.409811 543705 cpu.go:275] no items to output this cycle
E0320 20:16:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:03.409781 543705 memory.go:184] no items to output this cycle
I0320 20:16:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 20:16:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:13.409818 543705 memory.go:191] Add success.
I0320 20:16:13.409820 543705 cpu.go:282] Add success.
W0320 20:16:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:16:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:16:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:16:13.420269 543705 net.go:648] Add success.
I0320 20:16:13.423120 543705 net.go:770] primary dev: ETH0
I0320 20:16:13.423134 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:16:13.423146 543705 net.go:698] Add success.
I0320 20:16:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:16:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:16:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 20:16:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:16:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 20:16:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:16:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:16:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:16:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:16:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:16:16.472365 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:16:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:23.409797 543705 memory.go:184] no items to output this cycle
I0320 20:16:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 20:16:25.941672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:16:25.944171 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:16:25.944177 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004aedc0 0xc0004aee00]
E0320 20:16:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:33.409772 543705 memory.go:184] no items to output this cycle
I0320 20:16:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 20:16:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:43.409792 543705 memory.go:191] Add success.
I0320 20:16:43.409807 543705 cpu.go:282] Add success.
I0320 20:16:43.419953 543705 net.go:648] Add success.
I0320 20:16:43.422763 543705 net.go:770] primary dev: ETH0
I0320 20:16:43.422778 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:16:43.422792 543705 net.go:698] Add success.
I0320 20:16:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:16:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:16:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:16:53.410657 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:16:53.410671 543705 memory.go:184] no items to output this cycle
I0320 20:16:53.410672 543705 cpu.go:275] no items to output this cycle
E0320 20:17:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:03.409784 543705 memory.go:184] no items to output this cycle
I0320 20:17:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 20:17:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:13.409789 543705 memory.go:191] Add success.
I0320 20:17:13.409790 543705 cpu.go:282] Add success.
W0320 20:17:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:17:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:17:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:17:13.420045 543705 net.go:648] Add success.
I0320 20:17:13.422706 543705 net.go:770] primary dev: ETH0
I0320 20:17:13.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:17:13.422731 543705 net.go:698] Add success.
I0320 20:17:13.453387 543705 event_worker.go:152] Polling the log file for events...
W0320 20:17:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:17:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 20:17:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:17:14.456470 543705 disk_worker.go:494] system disk:vda1
I0320 20:17:14.456497 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:17:14.456900 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:17:14.456909 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:17:14.456915 543705 custom_config.go:64] query custom config with name: gpu
E0320 20:17:15.456819 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:17:15.456827 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:17:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:17:16.457950 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:17:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:17:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:17:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:17:23.410356 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:23.410371 543705 memory.go:184] no items to output this cycle
I0320 20:17:23.410379 543705 cpu.go:275] no items to output this cycle
I0320 20:17:25.945670 543705 disk_info.go:125] begin check local disk info of client
I0320 20:17:25.948133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:17:25.948141 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484000 0xc000484040]
E0320 20:17:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:33.409799 543705 memory.go:184] no items to output this cycle
I0320 20:17:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:17:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:43.409802 543705 cpu.go:282] Add success.
I0320 20:17:43.409807 543705 memory.go:191] Add success.
I0320 20:17:43.420002 543705 net.go:648] Add success.
I0320 20:17:43.422688 543705 net.go:770] primary dev: ETH0
I0320 20:17:43.422702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:17:43.422716 543705 net.go:698] Add success.
I0320 20:17:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:17:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:17:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:17:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:17:53.409784 543705 memory.go:184] no items to output this cycle
I0320 20:17:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:18:03.409875 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:03.409906 543705 memory.go:184] no items to output this cycle
I0320 20:18:03.409923 543705 cpu.go:275] no items to output this cycle
E0320 20:18:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:13.409813 543705 memory.go:191] Add success.
I0320 20:18:13.409820 543705 cpu.go:282] Add success.
W0320 20:18:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:18:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:18:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:18:13.420126 543705 net.go:648] Add success.
I0320 20:18:13.422844 543705 net.go:770] primary dev: ETH0
I0320 20:18:13.422857 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:18:13.422868 543705 net.go:698] Add success.
I0320 20:18:13.468796 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e8bf7550-ff7d-439e-b597-66b789996ca8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:18:13.468831 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:18:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:18:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 20:18:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:18:14.456518 543705 disk_worker.go:494] system disk:vda1
I0320 20:18:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:18:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:18:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:18:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:18:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:18:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:18:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 20:18:23.409780 543705 memory.go:184] no items to output this cycle
I0320 20:18:25.949671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:18:25.952134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:18:25.952140 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003efc00 0xc0003efc40]
E0320 20:18:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:18:33.409794 543705 memory.go:184] no items to output this cycle
I0320 20:18:38.611760 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:18:38.611767 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:18:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:43.410813 543705 memory.go:191] Add success.
I0320 20:18:43.409805 543705 cpu.go:282] Add success.
I0320 20:18:43.420524 543705 net.go:648] Add success.
I0320 20:18:43.423092 543705 net.go:770] primary dev: ETH0
I0320 20:18:43.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:18:43.423122 543705 net.go:698] Add success.
I0320 20:18:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:18:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:18:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:18:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:18:53.409771 543705 memory.go:184] no items to output this cycle
I0320 20:18:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 20:19:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:03.409783 543705 memory.go:184] no items to output this cycle
I0320 20:19:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 20:19:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:13.409800 543705 memory.go:191] Add success.
I0320 20:19:13.409801 543705 cpu.go:282] Add success.
W0320 20:19:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:19:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:19:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:19:13.420264 543705 net.go:648] Add success.
I0320 20:19:13.422977 543705 net.go:770] primary dev: ETH0
I0320 20:19:13.423001 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:19:13.423015 543705 net.go:698] Add success.
I0320 20:19:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:19:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:19:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 20:19:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:19:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 20:19:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:19:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:19:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:19:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:19:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:19:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:23.409792 543705 memory.go:184] no items to output this cycle
I0320 20:19:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 20:19:25.953671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:19:25.956089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:19:25.956095 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037f440 0xc00037f480]
E0320 20:19:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:33.409777 543705 memory.go:184] no items to output this cycle
I0320 20:19:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:19:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:43.409788 543705 memory.go:191] Add success.
I0320 20:19:43.409788 543705 cpu.go:282] Add success.
I0320 20:19:43.419718 543705 net.go:648] Add success.
I0320 20:19:43.422894 543705 net.go:770] primary dev: ETH0
I0320 20:19:43.422909 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:19:43.422922 543705 net.go:698] Add success.
I0320 20:19:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:19:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:19:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:19:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:19:53.409777 543705 memory.go:184] no items to output this cycle
I0320 20:19:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 20:20:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:03.409776 543705 memory.go:184] no items to output this cycle
I0320 20:20:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 20:20:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:13.409820 543705 memory.go:191] Add success.
I0320 20:20:13.409827 543705 cpu.go:282] Add success.
W0320 20:20:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:20:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:20:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:20:13.420139 543705 net.go:648] Add success.
I0320 20:20:13.422909 543705 net.go:770] primary dev: ETH0
I0320 20:20:13.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:20:13.422936 543705 net.go:698] Add success.
I0320 20:20:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:20:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:20:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 20:20:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:20:14.456514 543705 disk_worker.go:494] system disk:vda1
I0320 20:20:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:20:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:20:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:20:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:20:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:20:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:20:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:23.409780 543705 memory.go:184] no items to output this cycle
I0320 20:20:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 20:20:25.957672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:20:25.960144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:20:25.960150 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ae140 0xc0003ae180]
E0320 20:20:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:33.409793 543705 memory.go:184] no items to output this cycle
I0320 20:20:33.409821 543705 cpu.go:275] no items to output this cycle
E0320 20:20:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:43.409797 543705 memory.go:191] Add success.
I0320 20:20:43.409798 543705 cpu.go:282] Add success.
I0320 20:20:43.420092 543705 net.go:648] Add success.
I0320 20:20:43.423644 543705 net.go:770] primary dev: ETH0
I0320 20:20:43.423657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:20:43.423669 543705 net.go:698] Add success.
I0320 20:20:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:20:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:20:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:20:53.410253 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:20:53.410269 543705 memory.go:184] no items to output this cycle
I0320 20:20:53.410268 543705 cpu.go:275] no items to output this cycle
E0320 20:21:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:03.409781 543705 memory.go:184] no items to output this cycle
I0320 20:21:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:21:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:13.409794 543705 memory.go:191] Add success.
I0320 20:21:13.409794 543705 cpu.go:282] Add success.
W0320 20:21:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:21:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:21:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:21:13.420132 543705 net.go:648] Add success.
I0320 20:21:13.422892 543705 net.go:770] primary dev: ETH0
I0320 20:21:13.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:21:13.422916 543705 net.go:698] Add success.
I0320 20:21:13.469233 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4466d0e9-831e-49ca-99b1-24d140a5eec3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:21:13.469268 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:21:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:21:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:21:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 20:21:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:21:14.456623 543705 disk_worker.go:494] system disk:vda1
I0320 20:21:14.456653 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:21:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:21:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:21:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:21:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:21:16.472476 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:21:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:23.409773 543705 memory.go:184] no items to output this cycle
I0320 20:21:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:21:25.961672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:21:25.964141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:21:25.964148 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039f440 0xc00039f480]
E0320 20:21:33.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:33.409810 543705 memory.go:184] no items to output this cycle
I0320 20:21:33.409823 543705 cpu.go:275] no items to output this cycle
I0320 20:21:38.612762 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:21:38.612769 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:21:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:43.410705 543705 memory.go:191] Add success.
I0320 20:21:43.409786 543705 cpu.go:282] Add success.
I0320 20:21:43.420638 543705 net.go:648] Add success.
I0320 20:21:43.423133 543705 net.go:770] primary dev: ETH0
I0320 20:21:43.423147 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:21:43.423158 543705 net.go:698] Add success.
I0320 20:21:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:21:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:21:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:21:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:21:53.409781 543705 memory.go:184] no items to output this cycle
I0320 20:21:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 20:22:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:03.409771 543705 memory.go:184] no items to output this cycle
I0320 20:22:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 20:22:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:13.409821 543705 memory.go:191] Add success.
I0320 20:22:13.409830 543705 cpu.go:282] Add success.
W0320 20:22:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:22:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:22:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:22:13.420111 543705 net.go:648] Add success.
I0320 20:22:13.422745 543705 net.go:770] primary dev: ETH0
I0320 20:22:13.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:22:13.422771 543705 net.go:698] Add success.
W0320 20:22:14.455142 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:22:14.455152 543705 disk_worker.go:708] disk space is not compliant
W0320 20:22:14.455155 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:22:14.456893 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:22:14.456902 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:22:14.456909 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:22:14.456974 543705 disk_worker.go:494] system disk:vda1
I0320 20:22:14.457015 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:22:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:22:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:22:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:22:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:22:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:22:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:22:16.472360 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:22:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:23.409800 543705 memory.go:184] no items to output this cycle
I0320 20:22:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 20:22:25.965680 543705 disk_info.go:125] begin check local disk info of client
I0320 20:22:25.968169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:22:25.968176 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272680 0xc0002726c0]
E0320 20:22:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:33.409777 543705 memory.go:184] no items to output this cycle
I0320 20:22:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 20:22:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:43.409801 543705 memory.go:191] Add success.
I0320 20:22:43.409802 543705 cpu.go:282] Add success.
I0320 20:22:43.420243 543705 net.go:648] Add success.
I0320 20:22:43.422776 543705 net.go:770] primary dev: ETH0
I0320 20:22:43.422790 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:22:43.422802 543705 net.go:698] Add success.
I0320 20:22:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:22:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:22:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:22:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:22:53.409764 543705 memory.go:184] no items to output this cycle
I0320 20:22:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 20:23:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:03.409807 543705 memory.go:184] no items to output this cycle
I0320 20:23:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 20:23:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:13.409796 543705 memory.go:191] Add success.
I0320 20:23:13.409798 543705 cpu.go:282] Add success.
W0320 20:23:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:23:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:23:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:23:13.420098 543705 net.go:648] Add success.
I0320 20:23:13.422858 543705 net.go:770] primary dev: ETH0
I0320 20:23:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:23:13.422882 543705 net.go:698] Add success.
I0320 20:23:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:23:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:23:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 20:23:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:23:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 20:23:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:23:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:23:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:23:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:23:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:23:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:23:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:23.409800 543705 memory.go:184] no items to output this cycle
I0320 20:23:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 20:23:25.969676 543705 disk_info.go:125] begin check local disk info of client
I0320 20:23:25.972091 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:23:25.972098 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bbb00 0xc0003bbb40]
E0320 20:23:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:33.409771 543705 memory.go:184] no items to output this cycle
I0320 20:23:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 20:23:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:43.409812 543705 memory.go:191] Add success.
I0320 20:23:43.409818 543705 cpu.go:282] Add success.
I0320 20:23:43.419871 543705 net.go:648] Add success.
I0320 20:23:43.423038 543705 net.go:770] primary dev: ETH0
I0320 20:23:43.423054 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:23:43.423068 543705 net.go:698] Add success.
I0320 20:23:46.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:23:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:23:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:23:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:23:53.409810 543705 memory.go:184] no items to output this cycle
I0320 20:23:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 20:24:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:03.409810 543705 memory.go:184] no items to output this cycle
I0320 20:24:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 20:24:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:13.409788 543705 memory.go:191] Add success.
I0320 20:24:13.409808 543705 cpu.go:282] Add success.
W0320 20:24:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:24:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:24:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:24:13.420042 543705 net.go:648] Add success.
I0320 20:24:13.422869 543705 net.go:770] primary dev: ETH0
I0320 20:24:13.422882 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:24:13.422894 543705 net.go:698] Add success.
I0320 20:24:13.487666 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"085f606c-7d8f-4d7c-abe2-9b82f5d79d90","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:24:13.487701 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:24:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:24:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:24:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 20:24:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:24:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 20:24:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:24:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:24:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:24:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:24:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:24:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:24:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:23.409769 543705 memory.go:184] no items to output this cycle
I0320 20:24:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 20:24:25.973684 543705 disk_info.go:125] begin check local disk info of client
I0320 20:24:25.976180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:24:25.976186 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039fdc0 0xc00039fe00]
E0320 20:24:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:33.409773 543705 memory.go:184] no items to output this cycle
I0320 20:24:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 20:24:38.613733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:24:38.613740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:24:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:43.410529 543705 memory.go:191] Add success.
I0320 20:24:43.409793 543705 cpu.go:282] Add success.
I0320 20:24:43.420314 543705 net.go:648] Add success.
I0320 20:24:43.422803 543705 net.go:770] primary dev: ETH0
I0320 20:24:43.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:24:43.422828 543705 net.go:698] Add success.
I0320 20:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:24:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:24:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:24:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:24:53.409771 543705 memory.go:184] no items to output this cycle
I0320 20:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 20:25:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:03.409785 543705 memory.go:184] no items to output this cycle
I0320 20:25:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 20:25:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:13.409799 543705 cpu.go:282] Add success.
I0320 20:25:13.409807 543705 memory.go:191] Add success.
W0320 20:25:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:25:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:25:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:25:13.420158 543705 net.go:648] Add success.
I0320 20:25:13.422894 543705 net.go:770] primary dev: ETH0
I0320 20:25:13.422907 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:25:13.422919 543705 net.go:698] Add success.
I0320 20:25:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:25:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:25:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 20:25:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:25:14.456529 543705 disk_worker.go:494] system disk:vda1
I0320 20:25:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:25:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:25:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:25:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:25:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:25:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:25:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:23.409806 543705 memory.go:184] no items to output this cycle
I0320 20:25:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 20:25:25.977671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:25:25.980120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:25:25.980127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a6340 0xc0004a6380]
E0320 20:25:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:33.409777 543705 memory.go:184] no items to output this cycle
I0320 20:25:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:25:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:43.409792 543705 memory.go:191] Add success.
I0320 20:25:43.409806 543705 cpu.go:282] Add success.
I0320 20:25:43.419928 543705 net.go:648] Add success.
I0320 20:25:43.422509 543705 net.go:770] primary dev: ETH0
I0320 20:25:43.422521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:25:43.422533 543705 net.go:698] Add success.
I0320 20:25:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:25:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:25:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:25:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:25:53.409801 543705 memory.go:184] no items to output this cycle
I0320 20:25:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:26:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:03.409812 543705 memory.go:184] no items to output this cycle
I0320 20:26:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 20:26:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:13.409787 543705 memory.go:191] Add success.
W0320 20:26:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 20:26:13.409816 543705 cpu.go:282] Add success.
W0320 20:26:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:26:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:26:13.420156 543705 net.go:648] Add success.
I0320 20:26:13.422849 543705 net.go:770] primary dev: ETH0
I0320 20:26:13.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:26:13.422878 543705 net.go:698] Add success.
I0320 20:26:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:26:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:26:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 20:26:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:26:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 20:26:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:26:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:26:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:26:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:26:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:26:16.472422 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:26:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:23.409774 543705 memory.go:184] no items to output this cycle
I0320 20:26:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 20:26:25.981674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:26:25.984179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:26:25.984185 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba340 0xc0003ba380]
E0320 20:26:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:33.409792 543705 memory.go:184] no items to output this cycle
I0320 20:26:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 20:26:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:43.409793 543705 memory.go:191] Add success.
I0320 20:26:43.409808 543705 cpu.go:282] Add success.
I0320 20:26:43.419876 543705 net.go:648] Add success.
I0320 20:26:43.422673 543705 net.go:770] primary dev: ETH0
I0320 20:26:43.422687 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:26:43.422700 543705 net.go:698] Add success.
I0320 20:26:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:26:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:26:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:26:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:26:53.409785 543705 memory.go:184] no items to output this cycle
I0320 20:26:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 20:27:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:03.409807 543705 memory.go:184] no items to output this cycle
I0320 20:27:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:27:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:13.409785 543705 memory.go:191] Add success.
I0320 20:27:13.409808 543705 cpu.go:282] Add success.
W0320 20:27:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:27:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:27:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:27:13.420204 543705 net.go:648] Add success.
I0320 20:27:13.429063 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 20:27:13.429137 543705 net.go:770] primary dev: ETH0
I0320 20:27:13.429150 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:27:13.429169 543705 net.go:698] Add success.
I0320 20:27:13.453717 543705 event_worker.go:152] Polling the log file for events...
I0320 20:27:13.485743 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"851aef01-b83d-4b58-b80e-7f0e01ce276b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:27:13.485776 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 20:27:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:27:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 20:27:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:27:14.456904 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:27:14.456913 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:27:14.456918 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:27:14.456990 543705 disk_worker.go:494] system disk:vda1
I0320 20:27:14.457018 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:27:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:27:15.456817 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:27:16.457925 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:27:16.457924 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:27:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:27:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:27:16.472311 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:27:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:23.409774 543705 memory.go:184] no items to output this cycle
I0320 20:27:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 20:27:25.985673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:27:25.988140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:27:25.988147 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357000 0xc000357040]
E0320 20:27:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:33.409770 543705 memory.go:184] no items to output this cycle
I0320 20:27:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:27:38.613883 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:27:38.613890 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:27:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:43.410675 543705 memory.go:191] Add success.
I0320 20:27:43.409806 543705 cpu.go:282] Add success.
I0320 20:27:43.420354 543705 net.go:648] Add success.
I0320 20:27:43.422977 543705 net.go:770] primary dev: ETH0
I0320 20:27:43.422989 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:27:43.423001 543705 net.go:698] Add success.
I0320 20:27:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:27:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:27:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:27:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:27:53.409797 543705 memory.go:184] no items to output this cycle
I0320 20:27:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 20:28:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:03.409813 543705 memory.go:184] no items to output this cycle
I0320 20:28:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 20:28:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:13.409784 543705 memory.go:191] Add success.
W0320 20:28:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 20:28:13.409814 543705 cpu.go:282] Add success.
W0320 20:28:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:28:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:28:13.420123 543705 net.go:648] Add success.
I0320 20:28:13.422806 543705 net.go:770] primary dev: ETH0
I0320 20:28:13.422820 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:28:13.422834 543705 net.go:698] Add success.
I0320 20:28:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:28:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:28:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 20:28:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:28:14.456587 543705 disk_worker.go:494] system disk:vda1
I0320 20:28:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:28:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:28:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:28:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:28:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:28:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:28:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:23.409779 543705 memory.go:184] no items to output this cycle
I0320 20:28:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 20:28:25.989676 543705 disk_info.go:125] begin check local disk info of client
I0320 20:28:25.992174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:28:25.992180 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d5d80 0xc0003d5dc0]
E0320 20:28:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:33.409800 543705 memory.go:184] no items to output this cycle
I0320 20:28:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 20:28:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:43.409793 543705 cpu.go:282] Add success.
I0320 20:28:43.409797 543705 memory.go:191] Add success.
I0320 20:28:43.420051 543705 net.go:648] Add success.
I0320 20:28:43.422830 543705 net.go:770] primary dev: ETH0
I0320 20:28:43.422843 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:28:43.422854 543705 net.go:698] Add success.
I0320 20:28:46.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:28:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:28:46.458091 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:28:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:28:53.409798 543705 memory.go:184] no items to output this cycle
I0320 20:28:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:29:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:03.409789 543705 cpu.go:275] no items to output this cycle
I0320 20:29:03.409796 543705 memory.go:184] no items to output this cycle
E0320 20:29:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:13.409823 543705 memory.go:191] Add success.
I0320 20:29:13.409832 543705 cpu.go:282] Add success.
W0320 20:29:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:29:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:29:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:29:13.420271 543705 net.go:648] Add success.
I0320 20:29:13.422957 543705 net.go:770] primary dev: ETH0
I0320 20:29:13.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:29:13.422982 543705 net.go:698] Add success.
I0320 20:29:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:29:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:29:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0320 20:29:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:29:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 20:29:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:29:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:29:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:29:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:29:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:29:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:29:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:23.409813 543705 memory.go:184] no items to output this cycle
I0320 20:29:23.409824 543705 cpu.go:275] no items to output this cycle
I0320 20:29:25.993673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:29:25.996158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:29:25.996165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5840 0xc0000c5880]
E0320 20:29:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:33.409795 543705 memory.go:184] no items to output this cycle
I0320 20:29:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:29:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:43.409792 543705 cpu.go:282] Add success.
I0320 20:29:43.409802 543705 memory.go:191] Add success.
I0320 20:29:43.419983 543705 net.go:648] Add success.
I0320 20:29:43.422715 543705 net.go:770] primary dev: ETH0
I0320 20:29:43.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:29:43.422744 543705 net.go:698] Add success.
I0320 20:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:29:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:29:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:29:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:29:53.409774 543705 memory.go:184] no items to output this cycle
I0320 20:29:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 20:30:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:03.409796 543705 memory.go:184] no items to output this cycle
I0320 20:30:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:30:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:13.409787 543705 memory.go:191] Add success.
I0320 20:30:13.409804 543705 cpu.go:282] Add success.
W0320 20:30:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:30:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:30:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:30:13.420129 543705 net.go:648] Add success.
I0320 20:30:13.422876 543705 net.go:770] primary dev: ETH0
I0320 20:30:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:30:13.422905 543705 net.go:698] Add success.
I0320 20:30:14.203541 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d7efcc6b-e6c7-434a-9767-f99f9375396f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:30:14.203581 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:30:14.453981 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:30:14.454251 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:30:14.454263 543705 disk_worker.go:708] disk space is not compliant
W0320 20:30:14.454267 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:30:14.455819 543705 disk_worker.go:494] system disk:vda1
I0320 20:30:14.455850 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:30:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:30:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:30:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:30:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:30:16.472359 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:30:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:23.409780 543705 memory.go:184] no items to output this cycle
I0320 20:30:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 20:30:25.997675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:30:26.000187 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:30:26.000193 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4d00 0xc0000c4d40]
E0320 20:30:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:33.409800 543705 memory.go:184] no items to output this cycle
I0320 20:30:33.409814 543705 cpu.go:275] no items to output this cycle
I0320 20:30:38.614032 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:30:38.614039 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:30:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:43.410534 543705 memory.go:191] Add success.
I0320 20:30:43.409824 543705 cpu.go:282] Add success.
I0320 20:30:43.420305 543705 net.go:648] Add success.
I0320 20:30:43.423067 543705 net.go:770] primary dev: ETH0
I0320 20:30:43.423084 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:30:43.423097 543705 net.go:698] Add success.
I0320 20:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:30:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:30:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:30:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:30:53.409800 543705 memory.go:184] no items to output this cycle
I0320 20:30:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:31:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:03.409771 543705 memory.go:184] no items to output this cycle
I0320 20:31:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 20:31:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:13.409813 543705 memory.go:191] Add success.
I0320 20:31:13.409816 543705 cpu.go:282] Add success.
W0320 20:31:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:31:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:31:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:31:13.420098 543705 net.go:648] Add success.
I0320 20:31:13.423150 543705 net.go:770] primary dev: ETH0
I0320 20:31:13.423162 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:31:13.423174 543705 net.go:698] Add success.
I0320 20:31:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:31:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:31:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 20:31:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:31:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 20:31:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:31:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:31:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:31:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:31:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:31:16.472359 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:31:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:23.409788 543705 memory.go:184] no items to output this cycle
I0320 20:31:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 20:31:26.001680 543705 disk_info.go:125] begin check local disk info of client
I0320 20:31:26.004183 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:31:26.004188 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340d80 0xc000340dc0]
E0320 20:31:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 20:31:33.409786 543705 memory.go:184] no items to output this cycle
E0320 20:31:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:43.409825 543705 memory.go:191] Add success.
I0320 20:31:43.409826 543705 cpu.go:282] Add success.
I0320 20:31:43.419956 543705 net.go:648] Add success.
I0320 20:31:43.422554 543705 net.go:770] primary dev: ETH0
I0320 20:31:43.422569 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:31:43.422583 543705 net.go:698] Add success.
I0320 20:31:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:31:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:31:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:31:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:31:53.409807 543705 memory.go:184] no items to output this cycle
I0320 20:31:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 20:32:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:03.409795 543705 memory.go:184] no items to output this cycle
I0320 20:32:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:32:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:13.409801 543705 memory.go:191] Add success.
I0320 20:32:13.409804 543705 cpu.go:282] Add success.
W0320 20:32:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:32:13.412395 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:32:13.412400 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:32:13.420045 543705 net.go:648] Add success.
I0320 20:32:13.421802 543705 net.go:770] primary dev: ETH0
I0320 20:32:13.421815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:32:13.421827 543705 net.go:698] Add success.
W0320 20:32:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:32:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 20:32:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:32:14.455936 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:32:14.455944 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:32:14.455950 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:32:14.456543 543705 disk_worker.go:494] system disk:vda1
I0320 20:32:14.456574 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:32:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:32:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:32:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:32:16.457991 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:32:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:32:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:32:16.472461 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:32:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:23.409777 543705 memory.go:184] no items to output this cycle
I0320 20:32:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:32:26.005672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:32:26.008182 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:32:26.008188 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035e500 0xc00035e540]
E0320 20:32:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:33.409806 543705 memory.go:184] no items to output this cycle
I0320 20:32:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 20:32:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:43.409802 543705 memory.go:191] Add success.
I0320 20:32:43.409806 543705 cpu.go:282] Add success.
I0320 20:32:43.420196 543705 net.go:648] Add success.
I0320 20:32:43.423143 543705 net.go:770] primary dev: ETH0
I0320 20:32:43.423161 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:32:43.423195 543705 net.go:698] Add success.
I0320 20:32:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:32:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:32:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:32:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:32:53.409786 543705 memory.go:184] no items to output this cycle
I0320 20:32:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 20:33:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:03.409775 543705 memory.go:184] no items to output this cycle
I0320 20:33:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 20:33:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:13.409798 543705 memory.go:191] Add success.
I0320 20:33:13.409804 543705 cpu.go:282] Add success.
W0320 20:33:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:33:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:33:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:33:13.420066 543705 net.go:648] Add success.
I0320 20:33:13.423061 543705 net.go:770] primary dev: ETH0
I0320 20:33:13.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:33:13.423091 543705 net.go:698] Add success.
I0320 20:33:13.471757 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f44fee3a-1f46-4a63-8da4-88f1d01de75d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:33:13.471792 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:33:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:33:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:33:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 20:33:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:33:14.456614 543705 disk_worker.go:494] system disk:vda1
I0320 20:33:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:33:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:33:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:33:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:33:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:33:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:33:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:23.409801 543705 memory.go:184] no items to output this cycle
I0320 20:33:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:33:26.009673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:33:26.012224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:33:26.012229 543705 disk_info.go:196] parse disk info done, disk is : [0xc000390580 0xc0003905c0]
E0320 20:33:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:33.409808 543705 memory.go:184] no items to output this cycle
I0320 20:33:33.409808 543705 cpu.go:275] no items to output this cycle
I0320 20:33:38.614180 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:33:38.614187 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:33:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:43.410747 543705 memory.go:191] Add success.
I0320 20:33:43.409816 543705 cpu.go:282] Add success.
I0320 20:33:43.420695 543705 net.go:648] Add success.
I0320 20:33:43.423699 543705 net.go:770] primary dev: ETH0
I0320 20:33:43.423714 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:33:43.423727 543705 net.go:698] Add success.
I0320 20:33:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:33:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:33:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:33:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:33:53.409785 543705 memory.go:184] no items to output this cycle
I0320 20:33:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 20:34:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:03.409814 543705 memory.go:184] no items to output this cycle
I0320 20:34:03.409832 543705 cpu.go:275] no items to output this cycle
E0320 20:34:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:13.409808 543705 memory.go:191] Add success.
I0320 20:34:13.409809 543705 cpu.go:282] Add success.
W0320 20:34:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:34:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:34:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:34:13.420195 543705 net.go:648] Add success.
I0320 20:34:13.423064 543705 net.go:770] primary dev: ETH0
I0320 20:34:13.423077 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:34:13.423088 543705 net.go:698] Add success.
I0320 20:34:14.454992 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:34:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:34:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 20:34:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:34:14.456632 543705 disk_worker.go:494] system disk:vda1
I0320 20:34:14.456667 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:34:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:34:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:34:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:34:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:34:16.472376 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:34:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 20:34:23.409788 543705 memory.go:184] no items to output this cycle
I0320 20:34:26.013675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:34:26.016150 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:34:26.016156 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470640 0xc000470680]
E0320 20:34:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:33.409775 543705 memory.go:184] no items to output this cycle
I0320 20:34:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:34:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:43.409799 543705 memory.go:191] Add success.
I0320 20:34:43.409802 543705 cpu.go:282] Add success.
I0320 20:34:43.420289 543705 net.go:648] Add success.
I0320 20:34:43.423016 543705 net.go:770] primary dev: ETH0
I0320 20:34:43.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:34:43.423045 543705 net.go:698] Add success.
I0320 20:34:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:34:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:34:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:34:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:34:53.409776 543705 memory.go:184] no items to output this cycle
I0320 20:34:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 20:35:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:03.409784 543705 memory.go:184] no items to output this cycle
I0320 20:35:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 20:35:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:13.409789 543705 memory.go:191] Add success.
W0320 20:35:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:35:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:35:13.409826 543705 cpu.go:282] Add success.
I0320 20:35:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:35:13.420331 543705 net.go:648] Add success.
I0320 20:35:13.423096 543705 net.go:770] primary dev: ETH0
I0320 20:35:13.423110 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:35:13.423127 543705 net.go:698] Add success.
I0320 20:35:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:35:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:35:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 20:35:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:35:14.456524 543705 disk_worker.go:494] system disk:vda1
I0320 20:35:14.456571 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:35:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:35:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:35:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:35:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:35:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:35:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:23.409795 543705 memory.go:184] no items to output this cycle
I0320 20:35:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 20:35:26.017669 543705 disk_info.go:125] begin check local disk info of client
I0320 20:35:26.020143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:35:26.020149 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d99c0 0xc0003d9a00]
E0320 20:35:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:33.409769 543705 memory.go:184] no items to output this cycle
I0320 20:35:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 20:35:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:43.409817 543705 memory.go:191] Add success.
I0320 20:35:43.409826 543705 cpu.go:282] Add success.
I0320 20:35:43.420065 543705 net.go:648] Add success.
I0320 20:35:43.422825 543705 net.go:770] primary dev: ETH0
I0320 20:35:43.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:35:43.422854 543705 net.go:698] Add success.
I0320 20:35:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:35:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:35:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:35:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:35:53.409799 543705 memory.go:184] no items to output this cycle
I0320 20:35:53.409812 543705 cpu.go:275] no items to output this cycle
E0320 20:36:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:03.409779 543705 memory.go:184] no items to output this cycle
I0320 20:36:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:36:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:13.409801 543705 memory.go:191] Add success.
I0320 20:36:13.409805 543705 cpu.go:282] Add success.
W0320 20:36:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:36:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:36:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:36:13.420206 543705 net.go:648] Add success.
I0320 20:36:13.423098 543705 net.go:770] primary dev: ETH0
I0320 20:36:13.423112 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:36:13.423124 543705 net.go:698] Add success.
I0320 20:36:13.566317 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"389539af-c54b-4cc3-be64-df4f83e8cbc1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:36:13.566352 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:36:14.454673 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:36:14.454828 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:36:14.454889 543705 disk_worker.go:708] disk space is not compliant
W0320 20:36:14.454892 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:36:14.456248 543705 disk_worker.go:494] system disk:vda1
I0320 20:36:14.456280 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:36:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:36:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:36:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:36:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:36:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:36:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:23.409782 543705 memory.go:184] no items to output this cycle
I0320 20:36:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:36:26.021675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:36:26.024115 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:36:26.024122 543705 disk_info.go:196] parse disk info done, disk is : [0xc000266a80 0xc000266ac0]
E0320 20:36:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:33.409781 543705 memory.go:184] no items to output this cycle
I0320 20:36:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:36:38.614328 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:36:38.614334 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:36:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:43.410692 543705 memory.go:191] Add success.
I0320 20:36:43.409822 543705 cpu.go:282] Add success.
I0320 20:36:43.420430 543705 net.go:648] Add success.
I0320 20:36:43.423244 543705 net.go:770] primary dev: ETH0
I0320 20:36:43.423259 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:36:43.423272 543705 net.go:698] Add success.
I0320 20:36:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:36:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:36:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:36:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:36:53.409780 543705 memory.go:184] no items to output this cycle
I0320 20:36:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:37:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:03.409785 543705 memory.go:184] no items to output this cycle
I0320 20:37:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:37:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:13.409816 543705 memory.go:191] Add success.
I0320 20:37:13.409824 543705 cpu.go:282] Add success.
W0320 20:37:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:37:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:37:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:37:13.420132 543705 net.go:648] Add success.
I0320 20:37:13.422963 543705 net.go:770] primary dev: ETH0
I0320 20:37:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:37:13.422988 543705 net.go:698] Add success.
I0320 20:37:13.453515 543705 event_worker.go:152] Polling the log file for events...
W0320 20:37:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:37:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 20:37:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:37:14.456922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:37:14.456931 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:37:14.456937 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:37:14.457006 543705 disk_worker.go:494] system disk:vda1
I0320 20:37:14.457037 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:37:15.456802 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:37:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:37:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:37:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:37:16.457988 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:37:16.458148 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:37:16.472089 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:37:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:23.409773 543705 memory.go:184] no items to output this cycle
I0320 20:37:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 20:37:26.025672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:37:26.028103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:37:26.028109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aad40 0xc0001aad80]
E0320 20:37:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:33.409774 543705 memory.go:184] no items to output this cycle
I0320 20:37:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 20:37:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:43.409816 543705 memory.go:191] Add success.
I0320 20:37:43.409827 543705 cpu.go:282] Add success.
I0320 20:37:43.420017 543705 net.go:648] Add success.
I0320 20:37:43.422798 543705 net.go:770] primary dev: ETH0
I0320 20:37:43.422813 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:37:43.422827 543705 net.go:698] Add success.
I0320 20:37:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:37:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:37:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:37:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:37:53.409801 543705 memory.go:184] no items to output this cycle
I0320 20:37:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 20:38:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:03.409805 543705 memory.go:184] no items to output this cycle
I0320 20:38:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:38:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:13.409820 543705 memory.go:191] Add success.
I0320 20:38:13.409821 543705 cpu.go:282] Add success.
W0320 20:38:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:38:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:38:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:38:13.420196 543705 net.go:648] Add success.
I0320 20:38:13.422948 543705 net.go:770] primary dev: ETH0
I0320 20:38:13.422961 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:38:13.422975 543705 net.go:698] Add success.
I0320 20:38:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:38:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:38:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 20:38:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:38:14.456523 543705 disk_worker.go:494] system disk:vda1
I0320 20:38:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:38:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:38:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:38:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:38:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:38:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:38:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:23.409797 543705 memory.go:184] no items to output this cycle
I0320 20:38:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 20:38:26.029671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:38:26.032124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:38:26.032130 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b0140 0xc0004b0180]
E0320 20:38:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:33.409777 543705 memory.go:184] no items to output this cycle
I0320 20:38:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:38:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:43.409791 543705 memory.go:191] Add success.
I0320 20:38:43.409792 543705 cpu.go:282] Add success.
I0320 20:38:43.419893 543705 net.go:648] Add success.
I0320 20:38:43.422494 543705 net.go:770] primary dev: ETH0
I0320 20:38:43.422508 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:38:43.422522 543705 net.go:698] Add success.
I0320 20:38:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:38:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:38:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:38:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:38:53.409775 543705 memory.go:184] no items to output this cycle
I0320 20:38:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:39:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:03.409809 543705 memory.go:184] no items to output this cycle
I0320 20:39:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 20:39:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:13.409820 543705 memory.go:191] Add success.
I0320 20:39:13.409831 543705 cpu.go:282] Add success.
W0320 20:39:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:39:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:39:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:39:13.420133 543705 net.go:648] Add success.
I0320 20:39:13.423091 543705 net.go:770] primary dev: ETH0
I0320 20:39:13.423110 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:39:13.423127 543705 net.go:698] Add success.
I0320 20:39:13.469600 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"19362160-2ce4-41c3-a40c-2e7285775958","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:39:13.469634 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:39:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:39:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:39:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 20:39:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:39:14.456548 543705 disk_worker.go:494] system disk:vda1
I0320 20:39:14.456602 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:39:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:39:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:39:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:39:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:39:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:23.409785 543705 memory.go:184] no items to output this cycle
I0320 20:39:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 20:39:26.033670 543705 disk_info.go:125] begin check local disk info of client
I0320 20:39:26.036156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:39:26.036162 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003813c0 0xc000381400]
E0320 20:39:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:33.409793 543705 memory.go:184] no items to output this cycle
I0320 20:39:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 20:39:38.614779 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:39:38.614785 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:39:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:43.410673 543705 memory.go:191] Add success.
I0320 20:39:43.409816 543705 cpu.go:282] Add success.
I0320 20:39:43.420427 543705 net.go:648] Add success.
I0320 20:39:43.423387 543705 net.go:770] primary dev: ETH0
I0320 20:39:43.423400 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:39:43.423412 543705 net.go:698] Add success.
I0320 20:39:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:39:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:39:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:39:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:39:53.409769 543705 memory.go:184] no items to output this cycle
I0320 20:39:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:40:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:03.409778 543705 memory.go:184] no items to output this cycle
I0320 20:40:03.409787 543705 cpu.go:275] no items to output this cycle
W0320 20:40:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:40:13.409728 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:40:13.409734 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:40:13.409804 543705 cpu.go:282] Add success.
E0320 20:40:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:13.409829 543705 memory.go:191] Add success.
I0320 20:40:13.420546 543705 net.go:648] Add success.
I0320 20:40:13.423338 543705 net.go:770] primary dev: ETH0
I0320 20:40:13.423353 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:40:13.423368 543705 net.go:698] Add success.
I0320 20:40:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:40:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:40:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 20:40:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:40:14.456608 543705 disk_worker.go:494] system disk:vda1
I0320 20:40:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:40:15.456011 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:40:16.458019 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:40:16.458081 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:40:16.458102 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:40:16.472452 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:40:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:23.409776 543705 memory.go:184] no items to output this cycle
I0320 20:40:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 20:40:26.037673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:40:26.040146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:40:26.040152 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471a80 0xc000471ac0]
I0320 20:40:33.409777 543705 cpu.go:275] no items to output this cycle
E0320 20:40:33.409861 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:33.409873 543705 memory.go:184] no items to output this cycle
E0320 20:40:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:43.409780 543705 memory.go:191] Add success.
I0320 20:40:43.409803 543705 cpu.go:282] Add success.
I0320 20:40:43.419903 543705 net.go:648] Add success.
I0320 20:40:43.422681 543705 net.go:770] primary dev: ETH0
I0320 20:40:43.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:40:43.422727 543705 net.go:698] Add success.
I0320 20:40:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:40:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:40:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:40:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:40:53.409800 543705 memory.go:184] no items to output this cycle
I0320 20:40:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 20:41:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:03.409789 543705 memory.go:184] no items to output this cycle
I0320 20:41:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 20:41:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:13.409795 543705 memory.go:191] Add success.
I0320 20:41:13.409796 543705 cpu.go:282] Add success.
W0320 20:41:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:41:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:41:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:41:13.420388 543705 net.go:648] Add success.
I0320 20:41:13.423160 543705 net.go:770] primary dev: ETH0
I0320 20:41:13.423174 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:41:13.423185 543705 net.go:698] Add success.
I0320 20:41:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:41:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:41:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 20:41:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:41:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 20:41:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:41:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:41:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:41:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:41:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:41:16.472428 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:41:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:23.409769 543705 memory.go:184] no items to output this cycle
I0320 20:41:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 20:41:26.041672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:41:26.044151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:41:26.044157 543705 disk_info.go:196] parse disk info done, disk is : [0xc000368a80 0xc000368ac0]
E0320 20:41:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:33.409800 543705 memory.go:184] no items to output this cycle
I0320 20:41:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 20:41:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:43.409784 543705 memory.go:191] Add success.
I0320 20:41:43.409809 543705 cpu.go:282] Add success.
I0320 20:41:43.420014 543705 net.go:648] Add success.
I0320 20:41:43.422925 543705 net.go:770] primary dev: ETH0
I0320 20:41:43.422941 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:41:43.422959 543705 net.go:698] Add success.
I0320 20:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:41:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:41:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:41:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:41:53.409801 543705 memory.go:184] no items to output this cycle
I0320 20:41:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:42:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:03.409786 543705 memory.go:184] no items to output this cycle
I0320 20:42:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:42:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:13.409795 543705 memory.go:191] Add success.
W0320 20:42:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 20:42:13.409828 543705 cpu.go:282] Add success.
W0320 20:42:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:42:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:42:13.420141 543705 net.go:648] Add success.
I0320 20:42:13.422953 543705 net.go:770] primary dev: ETH0
I0320 20:42:13.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:42:13.422978 543705 net.go:698] Add success.
I0320 20:42:13.470082 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b148ddac-b6a3-4d68-878d-35661d88f438","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:42:13.470114 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 20:42:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:42:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 20:42:14.455202 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:42:14.455922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:42:14.455930 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:42:14.455935 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:42:14.456592 543705 disk_worker.go:494] system disk:vda1
I0320 20:42:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:42:15.456857 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:42:15.456866 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:42:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:42:16.457953 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:42:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:42:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:42:16.472334 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:42:23.410366 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:23.410384 543705 memory.go:184] no items to output this cycle
I0320 20:42:23.410411 543705 cpu.go:275] no items to output this cycle
I0320 20:42:26.045673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:42:26.048116 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:42:26.048122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002adb80 0xc0002adbc0]
E0320 20:42:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:33.409779 543705 memory.go:184] no items to output this cycle
I0320 20:42:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 20:42:38.615775 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:42:38.615782 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:42:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:43.410644 543705 memory.go:191] Add success.
I0320 20:42:43.409822 543705 cpu.go:282] Add success.
I0320 20:42:43.420385 543705 net.go:648] Add success.
I0320 20:42:43.422935 543705 net.go:770] primary dev: ETH0
I0320 20:42:43.422950 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:42:43.422965 543705 net.go:698] Add success.
I0320 20:42:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:42:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:42:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:42:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:42:53.409764 543705 memory.go:184] no items to output this cycle
I0320 20:42:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 20:43:03.410300 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:03.410321 543705 memory.go:184] no items to output this cycle
I0320 20:43:03.410336 543705 cpu.go:275] no items to output this cycle
E0320 20:43:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:13.409821 543705 memory.go:191] Add success.
I0320 20:43:13.409836 543705 cpu.go:282] Add success.
W0320 20:43:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:43:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:43:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:43:13.420184 543705 net.go:648] Add success.
I0320 20:43:13.422716 543705 net.go:770] primary dev: ETH0
I0320 20:43:13.422731 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:43:13.422744 543705 net.go:698] Add success.
I0320 20:43:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:43:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:43:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 20:43:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:43:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 20:43:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:43:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:43:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:43:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:43:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:43:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:43:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:23.409795 543705 memory.go:184] no items to output this cycle
I0320 20:43:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 20:43:26.049670 543705 disk_info.go:125] begin check local disk info of client
I0320 20:43:26.052148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:43:26.052154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b3600 0xc0004b3640]
E0320 20:43:33.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:33.409887 543705 memory.go:184] no items to output this cycle
I0320 20:43:33.409967 543705 cpu.go:275] no items to output this cycle
E0320 20:43:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:43.409830 543705 memory.go:191] Add success.
I0320 20:43:43.409840 543705 cpu.go:282] Add success.
I0320 20:43:43.420127 543705 net.go:648] Add success.
I0320 20:43:43.422823 543705 net.go:770] primary dev: ETH0
I0320 20:43:43.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:43:43.422848 543705 net.go:698] Add success.
I0320 20:43:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:43:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:43:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:43:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:43:53.409817 543705 memory.go:184] no items to output this cycle
I0320 20:43:53.409821 543705 cpu.go:275] no items to output this cycle
E0320 20:44:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:03.409792 543705 memory.go:184] no items to output this cycle
I0320 20:44:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 20:44:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:13.409808 543705 memory.go:191] Add success.
I0320 20:44:13.409823 543705 cpu.go:282] Add success.
W0320 20:44:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:44:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:44:13.409851 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:44:13.420160 543705 net.go:648] Add success.
I0320 20:44:13.423577 543705 net.go:770] primary dev: ETH0
I0320 20:44:13.423592 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:44:13.423607 543705 net.go:698] Add success.
I0320 20:44:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:44:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:44:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 20:44:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:44:14.456510 543705 disk_worker.go:494] system disk:vda1
I0320 20:44:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:44:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:44:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:44:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:44:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:44:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:44:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:23.409768 543705 memory.go:184] no items to output this cycle
I0320 20:44:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 20:44:26.053677 543705 disk_info.go:125] begin check local disk info of client
I0320 20:44:26.056131 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:44:26.056137 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f0100 0xc0003f0140]
I0320 20:44:33.409922 543705 cpu.go:275] no items to output this cycle
E0320 20:44:33.409942 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:33.409960 543705 memory.go:184] no items to output this cycle
E0320 20:44:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:43.409826 543705 memory.go:191] Add success.
I0320 20:44:43.409829 543705 cpu.go:282] Add success.
I0320 20:44:43.419882 543705 net.go:648] Add success.
I0320 20:44:43.423206 543705 net.go:770] primary dev: ETH0
I0320 20:44:43.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:44:43.423230 543705 net.go:698] Add success.
I0320 20:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:44:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:44:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:44:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:44:53.409809 543705 memory.go:184] no items to output this cycle
I0320 20:44:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 20:45:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:03.409781 543705 memory.go:184] no items to output this cycle
I0320 20:45:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 20:45:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:13.409817 543705 memory.go:191] Add success.
I0320 20:45:13.409825 543705 cpu.go:282] Add success.
W0320 20:45:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:45:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:45:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:45:13.420177 543705 net.go:648] Add success.
I0320 20:45:13.423181 543705 net.go:770] primary dev: ETH0
I0320 20:45:13.423196 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:45:13.423210 543705 net.go:698] Add success.
I0320 20:45:13.469072 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5884ec59-4fc0-4b26-ad88-46b5a8c9792e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:45:13.469105 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:45:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:45:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:45:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 20:45:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:45:14.456605 543705 disk_worker.go:494] system disk:vda1
I0320 20:45:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:45:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:45:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:45:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:45:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:45:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:45:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:23.409766 543705 memory.go:184] no items to output this cycle
I0320 20:45:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 20:45:26.057674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:45:26.060165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:45:26.060171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d8900 0xc0004d8940]
E0320 20:45:33.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:33.409909 543705 memory.go:184] no items to output this cycle
I0320 20:45:33.409978 543705 cpu.go:275] no items to output this cycle
I0320 20:45:38.616783 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:45:38.616789 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:45:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:43.410744 543705 memory.go:191] Add success.
I0320 20:45:43.409828 543705 cpu.go:282] Add success.
I0320 20:45:43.420554 543705 net.go:648] Add success.
I0320 20:45:43.423420 543705 net.go:770] primary dev: ETH0
I0320 20:45:43.423432 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:45:43.423444 543705 net.go:698] Add success.
I0320 20:45:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:45:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:45:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:45:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:45:53.409798 543705 memory.go:184] no items to output this cycle
I0320 20:45:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 20:46:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:03.409783 543705 memory.go:184] no items to output this cycle
I0320 20:46:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 20:46:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:13.409818 543705 memory.go:191] Add success.
I0320 20:46:13.409829 543705 cpu.go:282] Add success.
W0320 20:46:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:46:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:46:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:46:13.420187 543705 net.go:648] Add success.
I0320 20:46:13.422926 543705 net.go:770] primary dev: ETH0
I0320 20:46:13.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:46:13.422966 543705 net.go:698] Add success.
I0320 20:46:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:46:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:46:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0320 20:46:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:46:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 20:46:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:46:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:46:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:46:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:46:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:46:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:46:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:23.409775 543705 memory.go:184] no items to output this cycle
I0320 20:46:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 20:46:26.061671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:46:26.064124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:46:26.064131 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a6800 0xc0004a6840]
I0320 20:46:33.409924 543705 cpu.go:275] no items to output this cycle
E0320 20:46:33.409993 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:33.410016 543705 memory.go:184] no items to output this cycle
E0320 20:46:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:43.409791 543705 memory.go:191] Add success.
I0320 20:46:43.409793 543705 cpu.go:282] Add success.
I0320 20:46:43.420015 543705 net.go:648] Add success.
I0320 20:46:43.422672 543705 net.go:770] primary dev: ETH0
I0320 20:46:43.422684 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:46:43.422697 543705 net.go:698] Add success.
I0320 20:46:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:46:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:46:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:46:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:46:53.409770 543705 memory.go:184] no items to output this cycle
I0320 20:46:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 20:47:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:03.409785 543705 memory.go:184] no items to output this cycle
I0320 20:47:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 20:47:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:13.409794 543705 memory.go:191] Add success.
I0320 20:47:13.409798 543705 cpu.go:282] Add success.
W0320 20:47:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:47:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:47:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:47:13.420082 543705 net.go:648] Add success.
I0320 20:47:13.422923 543705 net.go:770] primary dev: ETH0
I0320 20:47:13.422937 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:47:13.422949 543705 net.go:698] Add success.
I0320 20:47:13.453495 543705 event_worker.go:152] Polling the log file for events...
W0320 20:47:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:47:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 20:47:14.455195 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:47:14.456970 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:47:14.456979 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:47:14.456985 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:47:14.457028 543705 disk_worker.go:494] system disk:vda1
I0320 20:47:14.457068 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:47:15.456803 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:47:15.456812 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:47:16.457908 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:47:16.457908 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:47:16.457966 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:47:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:47:16.472324 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:47:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:23.409794 543705 memory.go:184] no items to output this cycle
I0320 20:47:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 20:47:26.065677 543705 disk_info.go:125] begin check local disk info of client
I0320 20:47:26.068117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:47:26.068124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ae500 0xc0003ae540]
E0320 20:47:33.409852 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:33.409874 543705 memory.go:184] no items to output this cycle
I0320 20:47:33.409878 543705 cpu.go:275] no items to output this cycle
E0320 20:47:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:43.409786 543705 memory.go:191] Add success.
I0320 20:47:43.409797 543705 cpu.go:282] Add success.
I0320 20:47:43.419895 543705 net.go:648] Add success.
I0320 20:47:43.422473 543705 net.go:770] primary dev: ETH0
I0320 20:47:43.422486 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:47:43.422497 543705 net.go:698] Add success.
I0320 20:47:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:47:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:47:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:47:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:47:53.409798 543705 memory.go:184] no items to output this cycle
I0320 20:47:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 20:48:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:03.409784 543705 memory.go:184] no items to output this cycle
I0320 20:48:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 20:48:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:13.409825 543705 memory.go:191] Add success.
I0320 20:48:13.409828 543705 cpu.go:282] Add success.
W0320 20:48:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:48:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:48:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:48:13.420333 543705 net.go:648] Add success.
I0320 20:48:13.423623 543705 net.go:770] primary dev: ETH0
I0320 20:48:13.423639 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:48:13.423653 543705 net.go:698] Add success.
I0320 20:48:13.469289 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8ff82b88-bdba-46e1-8430-3fa0b99b1cf3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:48:13.469322 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:48:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:48:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:48:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 20:48:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:48:14.456757 543705 disk_worker.go:494] system disk:vda1
I0320 20:48:14.456795 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:48:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:48:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:48:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:48:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:48:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:48:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:23.409776 543705 memory.go:184] no items to output this cycle
I0320 20:48:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 20:48:26.069673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:48:26.072181 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:48:26.072188 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004fd480 0xc0004fd4c0]
E0320 20:48:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:33.409799 543705 memory.go:184] no items to output this cycle
I0320 20:48:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:48:38.617733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:48:38.617739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:48:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:43.410678 543705 memory.go:191] Add success.
I0320 20:48:43.409797 543705 cpu.go:282] Add success.
I0320 20:48:43.420362 543705 net.go:648] Add success.
I0320 20:48:43.423065 543705 net.go:770] primary dev: ETH0
I0320 20:48:43.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:48:43.423090 543705 net.go:698] Add success.
I0320 20:48:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:48:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:48:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:48:53.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:48:53.409802 543705 memory.go:184] no items to output this cycle
I0320 20:48:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 20:49:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:03.409798 543705 memory.go:184] no items to output this cycle
I0320 20:49:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 20:49:13.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:13.409838 543705 memory.go:191] Add success.
I0320 20:49:13.409848 543705 cpu.go:282] Add success.
W0320 20:49:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:49:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:49:13.409893 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:49:13.420202 543705 net.go:648] Add success.
I0320 20:49:13.423081 543705 net.go:770] primary dev: ETH0
I0320 20:49:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:49:13.423109 543705 net.go:698] Add success.
I0320 20:49:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:49:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:49:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 20:49:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:49:14.456531 543705 disk_worker.go:494] system disk:vda1
I0320 20:49:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:49:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:49:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:49:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:49:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:49:16.472094 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:49:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:23.409788 543705 memory.go:184] no items to output this cycle
I0320 20:49:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 20:49:26.073679 543705 disk_info.go:125] begin check local disk info of client
I0320 20:49:26.076122 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:49:26.076128 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5b80 0xc0000c5bc0]
E0320 20:49:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:33.409798 543705 memory.go:184] no items to output this cycle
I0320 20:49:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 20:49:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:43.409791 543705 memory.go:191] Add success.
I0320 20:49:43.409808 543705 cpu.go:282] Add success.
I0320 20:49:43.419866 543705 net.go:648] Add success.
I0320 20:49:43.423453 543705 net.go:770] primary dev: ETH0
I0320 20:49:43.423467 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:49:43.423482 543705 net.go:698] Add success.
I0320 20:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:49:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:49:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:49:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:49:53.409778 543705 memory.go:184] no items to output this cycle
I0320 20:49:53.409777 543705 cpu.go:275] no items to output this cycle
E0320 20:50:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:03.409809 543705 memory.go:184] no items to output this cycle
I0320 20:50:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 20:50:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:13.409818 543705 memory.go:191] Add success.
I0320 20:50:13.409825 543705 cpu.go:282] Add success.
W0320 20:50:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:50:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:50:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:50:13.420322 543705 net.go:648] Add success.
I0320 20:50:13.423482 543705 net.go:770] primary dev: ETH0
I0320 20:50:13.423495 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:50:13.423507 543705 net.go:698] Add success.
I0320 20:50:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:50:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:50:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 20:50:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:50:14.456522 543705 disk_worker.go:494] system disk:vda1
I0320 20:50:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:50:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:50:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:50:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:50:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:50:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:50:23.409817 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:23.409840 543705 memory.go:184] no items to output this cycle
I0320 20:50:23.409909 543705 cpu.go:275] no items to output this cycle
I0320 20:50:26.077679 543705 disk_info.go:125] begin check local disk info of client
I0320 20:50:26.080126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:50:26.080133 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa840 0xc0001fa880]
E0320 20:50:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:33.409779 543705 memory.go:184] no items to output this cycle
I0320 20:50:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 20:50:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:43.409823 543705 memory.go:191] Add success.
I0320 20:50:43.409827 543705 cpu.go:282] Add success.
I0320 20:50:43.419897 543705 net.go:648] Add success.
I0320 20:50:43.422736 543705 net.go:770] primary dev: ETH0
I0320 20:50:43.422751 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:50:43.422766 543705 net.go:698] Add success.
I0320 20:50:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:50:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:50:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:50:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:50:53.409794 543705 memory.go:184] no items to output this cycle
I0320 20:50:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 20:51:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:03.409772 543705 memory.go:184] no items to output this cycle
I0320 20:51:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 20:51:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:13.409818 543705 memory.go:191] Add success.
I0320 20:51:13.409825 543705 cpu.go:282] Add success.
W0320 20:51:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:51:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:51:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:51:13.420135 543705 net.go:648] Add success.
I0320 20:51:13.423166 543705 net.go:770] primary dev: ETH0
I0320 20:51:13.423182 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:51:13.423196 543705 net.go:698] Add success.
I0320 20:51:13.470110 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"31389829-504a-48a1-893d-516900ac640d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:51:13.470145 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:51:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:51:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:51:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 20:51:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:51:14.456602 543705 disk_worker.go:494] system disk:vda1
I0320 20:51:14.456717 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:51:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:51:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:51:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:51:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:51:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:51:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:23.409771 543705 memory.go:184] no items to output this cycle
I0320 20:51:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 20:51:26.081675 543705 disk_info.go:125] begin check local disk info of client
I0320 20:51:26.084107 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:51:26.084114 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470d80 0xc000470dc0]
E0320 20:51:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:33.409800 543705 memory.go:184] no items to output this cycle
I0320 20:51:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:51:38.618797 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:51:38.618804 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:51:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:43.410662 543705 memory.go:191] Add success.
I0320 20:51:43.409824 543705 cpu.go:282] Add success.
I0320 20:51:43.420345 543705 net.go:648] Add success.
I0320 20:51:43.423415 543705 net.go:770] primary dev: ETH0
I0320 20:51:43.423439 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:51:43.423453 543705 net.go:698] Add success.
I0320 20:51:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:51:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:51:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:51:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:51:53.409776 543705 memory.go:184] no items to output this cycle
I0320 20:51:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:52:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:03.409807 543705 memory.go:184] no items to output this cycle
I0320 20:52:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 20:52:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:13.409789 543705 memory.go:191] Add success.
I0320 20:52:13.409814 543705 cpu.go:282] Add success.
W0320 20:52:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:52:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:52:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:52:13.420218 543705 net.go:648] Add success.
I0320 20:52:13.422936 543705 net.go:770] primary dev: ETH0
I0320 20:52:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:52:13.422960 543705 net.go:698] Add success.
W0320 20:52:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:52:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 20:52:14.455179 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:52:14.455855 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:52:14.455864 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:52:14.455870 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:52:14.456615 543705 disk_worker.go:494] system disk:vda1
I0320 20:52:14.456657 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:52:15.456873 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:52:15.456882 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:52:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:52:16.457943 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:52:16.457984 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:52:16.458000 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:52:16.472315 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:52:23.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:23.409768 543705 memory.go:184] no items to output this cycle
I0320 20:52:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 20:52:26.085674 543705 disk_info.go:125] begin check local disk info of client
I0320 20:52:26.088147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:52:26.088154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 20:52:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:33.409766 543705 memory.go:184] no items to output this cycle
I0320 20:52:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 20:52:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:43.409788 543705 memory.go:191] Add success.
I0320 20:52:43.409806 543705 cpu.go:282] Add success.
I0320 20:52:43.419842 543705 net.go:648] Add success.
I0320 20:52:43.422638 543705 net.go:770] primary dev: ETH0
I0320 20:52:43.422654 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:52:43.422669 543705 net.go:698] Add success.
I0320 20:52:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:52:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:52:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:52:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:52:53.409799 543705 memory.go:184] no items to output this cycle
I0320 20:52:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 20:53:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:03.409783 543705 memory.go:184] no items to output this cycle
I0320 20:53:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 20:53:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:13.409787 543705 memory.go:191] Add success.
I0320 20:53:13.409792 543705 cpu.go:282] Add success.
W0320 20:53:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:53:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:53:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:53:13.420315 543705 net.go:648] Add success.
I0320 20:53:13.422958 543705 net.go:770] primary dev: ETH0
I0320 20:53:13.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:53:13.422982 543705 net.go:698] Add success.
I0320 20:53:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:53:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:53:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 20:53:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:53:14.456544 543705 disk_worker.go:494] system disk:vda1
I0320 20:53:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:53:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:53:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:53:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:53:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:53:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:53:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:23.409797 543705 memory.go:184] no items to output this cycle
I0320 20:53:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 20:53:26.089672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:53:26.092256 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:53:26.092262 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbbc0 0xc0001fbc00]
E0320 20:53:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:33.409765 543705 memory.go:184] no items to output this cycle
I0320 20:53:33.409795 543705 cpu.go:275] no items to output this cycle
E0320 20:53:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:43.409815 543705 memory.go:191] Add success.
I0320 20:53:43.409818 543705 cpu.go:282] Add success.
I0320 20:53:43.419954 543705 net.go:648] Add success.
I0320 20:53:43.422674 543705 net.go:770] primary dev: ETH0
I0320 20:53:43.422689 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:53:43.422704 543705 net.go:698] Add success.
I0320 20:53:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:53:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:53:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:53:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:53:53.409815 543705 memory.go:184] no items to output this cycle
I0320 20:53:53.409825 543705 cpu.go:275] no items to output this cycle
E0320 20:54:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:03.409792 543705 memory.go:184] no items to output this cycle
I0320 20:54:03.409795 543705 cpu.go:275] no items to output this cycle
W0320 20:54:13.409714 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:54:13.409731 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:54:13.409736 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 20:54:13.409940 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:13.409964 543705 memory.go:191] Add success.
I0320 20:54:13.409973 543705 cpu.go:282] Add success.
I0320 20:54:13.419713 543705 net.go:648] Add success.
I0320 20:54:13.422389 543705 net.go:770] primary dev: ETH0
I0320 20:54:13.422402 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:54:13.422413 543705 net.go:698] Add success.
I0320 20:54:13.468551 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"00d99631-3db6-4f6f-8bdd-e334fa8f36b6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:54:13.468582 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 20:54:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:54:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:54:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 20:54:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:54:14.456678 543705 disk_worker.go:494] system disk:vda1
I0320 20:54:14.456734 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:54:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:54:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:54:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:54:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:54:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:54:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:23.409807 543705 memory.go:184] no items to output this cycle
I0320 20:54:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 20:54:26.093671 543705 disk_info.go:125] begin check local disk info of client
I0320 20:54:26.096201 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:54:26.096208 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002738c0 0xc000273900]
E0320 20:54:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:33.409796 543705 memory.go:184] no items to output this cycle
I0320 20:54:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 20:54:38.619802 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:54:38.619809 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:54:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:43.410603 543705 memory.go:191] Add success.
I0320 20:54:43.409812 543705 cpu.go:282] Add success.
I0320 20:54:43.420362 543705 net.go:648] Add success.
I0320 20:54:43.423097 543705 net.go:770] primary dev: ETH0
I0320 20:54:43.423114 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:54:43.423128 543705 net.go:698] Add success.
I0320 20:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:54:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:54:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:54:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:54:53.409789 543705 memory.go:184] no items to output this cycle
I0320 20:54:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 20:55:03.409856 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:03.409880 543705 memory.go:184] no items to output this cycle
I0320 20:55:03.409940 543705 cpu.go:275] no items to output this cycle
E0320 20:55:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:13.409803 543705 memory.go:191] Add success.
I0320 20:55:13.409809 543705 cpu.go:282] Add success.
W0320 20:55:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:55:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:55:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:55:13.420113 543705 net.go:648] Add success.
I0320 20:55:13.423405 543705 net.go:770] primary dev: ETH0
I0320 20:55:13.423418 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:55:13.423430 543705 net.go:698] Add success.
I0320 20:55:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:55:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:55:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 20:55:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:55:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 20:55:14.456635 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:55:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:55:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:55:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:55:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:55:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:55:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:23.409770 543705 memory.go:184] no items to output this cycle
I0320 20:55:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 20:55:26.097672 543705 disk_info.go:125] begin check local disk info of client
I0320 20:55:26.100128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:55:26.100134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004717c0 0xc000471800]
E0320 20:55:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:33.409783 543705 memory.go:184] no items to output this cycle
I0320 20:55:33.409798 543705 cpu.go:275] no items to output this cycle
E0320 20:55:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:43.409812 543705 memory.go:191] Add success.
I0320 20:55:43.409821 543705 cpu.go:282] Add success.
I0320 20:55:43.419876 543705 net.go:648] Add success.
I0320 20:55:43.422426 543705 net.go:770] primary dev: ETH0
I0320 20:55:43.422440 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:55:43.422453 543705 net.go:698] Add success.
I0320 20:55:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:55:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:55:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:55:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:55:53.409779 543705 cpu.go:275] no items to output this cycle
I0320 20:55:53.409782 543705 memory.go:184] no items to output this cycle
E0320 20:56:03.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:03.409891 543705 memory.go:184] no items to output this cycle
I0320 20:56:03.409939 543705 cpu.go:275] no items to output this cycle
E0320 20:56:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:13.409794 543705 memory.go:191] Add success.
I0320 20:56:13.409811 543705 cpu.go:282] Add success.
W0320 20:56:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:56:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:56:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:56:13.420185 543705 net.go:648] Add success.
I0320 20:56:13.422995 543705 net.go:770] primary dev: ETH0
I0320 20:56:13.423011 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:56:13.423025 543705 net.go:698] Add success.
I0320 20:56:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:56:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:56:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 20:56:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:56:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 20:56:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:56:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:56:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:56:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:56:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:56:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:56:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:23.409803 543705 memory.go:184] no items to output this cycle
I0320 20:56:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:56:26.101676 543705 disk_info.go:125] begin check local disk info of client
I0320 20:56:26.104207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:56:26.104213 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b8c0 0xc00007b900]
E0320 20:56:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:33.409765 543705 memory.go:184] no items to output this cycle
I0320 20:56:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 20:56:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:43.409810 543705 memory.go:191] Add success.
I0320 20:56:43.409818 543705 cpu.go:282] Add success.
I0320 20:56:43.420019 543705 net.go:648] Add success.
I0320 20:56:43.422512 543705 net.go:770] primary dev: ETH0
I0320 20:56:43.422526 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:56:43.422541 543705 net.go:698] Add success.
I0320 20:56:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:56:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:56:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:56:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:56:53.409782 543705 memory.go:184] no items to output this cycle
I0320 20:56:53.409798 543705 cpu.go:275] no items to output this cycle
I0320 20:57:03.409912 543705 cpu.go:275] no items to output this cycle
E0320 20:57:03.409912 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:03.409933 543705 memory.go:184] no items to output this cycle
E0320 20:57:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:13.409816 543705 memory.go:191] Add success.
I0320 20:57:13.409827 543705 cpu.go:282] Add success.
W0320 20:57:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:57:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:57:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:57:13.420495 543705 net.go:648] Add success.
I0320 20:57:13.428951 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 20:57:13.429029 543705 net.go:770] primary dev: ETH0
I0320 20:57:13.429040 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:57:13.429051 543705 net.go:698] Add success.
I0320 20:57:13.453602 543705 event_worker.go:152] Polling the log file for events...
I0320 20:57:13.464730 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"04a35aa3-8fb9-49e1-abad-364bd3909b9e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 20:57:13.464765 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 20:57:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:57:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 20:57:14.455191 543705 disk_worker.go:728] disk inode is not compliant
E0320 20:57:14.456099 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 20:57:14.456109 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 20:57:14.456115 543705 custom_config.go:64] query custom config with name: gpu
I0320 20:57:14.456473 543705 disk_worker.go:494] system disk:vda1
I0320 20:57:14.456502 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 20:57:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 20:57:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:57:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 20:57:16.457981 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 20:57:16.458025 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:57:16.458043 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:57:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:57:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:23.409791 543705 memory.go:184] no items to output this cycle
I0320 20:57:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 20:57:26.105665 543705 disk_info.go:125] begin check local disk info of client
I0320 20:57:26.108102 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:57:26.108108 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b400 0xc00007b440]
E0320 20:57:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:33.409799 543705 memory.go:184] no items to output this cycle
I0320 20:57:33.409812 543705 cpu.go:275] no items to output this cycle
I0320 20:57:38.619954 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 20:57:38.619961 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 20:57:43.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:43.410638 543705 memory.go:191] Add success.
I0320 20:57:43.409837 543705 cpu.go:282] Add success.
I0320 20:57:43.420421 543705 net.go:648] Add success.
I0320 20:57:43.423268 543705 net.go:770] primary dev: ETH0
I0320 20:57:43.423282 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:57:43.423296 543705 net.go:698] Add success.
I0320 20:57:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:57:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:57:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:57:53.409888 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:57:53.409908 543705 memory.go:184] no items to output this cycle
I0320 20:57:53.409965 543705 cpu.go:275] no items to output this cycle
E0320 20:58:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:03.409795 543705 memory.go:184] no items to output this cycle
I0320 20:58:03.409803 543705 cpu.go:275] no items to output this cycle
E0320 20:58:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:13.409830 543705 memory.go:191] Add success.
I0320 20:58:13.409843 543705 cpu.go:282] Add success.
W0320 20:58:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:58:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:58:13.409885 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:58:13.420146 543705 net.go:648] Add success.
I0320 20:58:13.422985 543705 net.go:770] primary dev: ETH0
I0320 20:58:13.422998 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:58:13.423009 543705 net.go:698] Add success.
I0320 20:58:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:58:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:58:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 20:58:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:58:14.456509 543705 disk_worker.go:494] system disk:vda1
I0320 20:58:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:58:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:58:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:58:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:58:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:58:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:58:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:23.409797 543705 memory.go:184] no items to output this cycle
I0320 20:58:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 20:58:26.109678 543705 disk_info.go:125] begin check local disk info of client
I0320 20:58:26.112133 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:58:26.112139 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003c8ec0 0xc0003c8f00]
E0320 20:58:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:33.409778 543705 memory.go:184] no items to output this cycle
I0320 20:58:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 20:58:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:43.409785 543705 memory.go:191] Add success.
I0320 20:58:43.409798 543705 cpu.go:282] Add success.
I0320 20:58:43.419867 543705 net.go:648] Add success.
I0320 20:58:43.422537 543705 net.go:770] primary dev: ETH0
I0320 20:58:43.422549 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:58:43.422576 543705 net.go:698] Add success.
I0320 20:58:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:58:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:58:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:58:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:58:53.409802 543705 memory.go:184] no items to output this cycle
I0320 20:58:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 20:59:03.409906 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:03.410011 543705 memory.go:184] no items to output this cycle
I0320 20:59:03.410021 543705 cpu.go:275] no items to output this cycle
E0320 20:59:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:13.409799 543705 memory.go:191] Add success.
I0320 20:59:13.409806 543705 cpu.go:282] Add success.
W0320 20:59:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 20:59:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 20:59:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 20:59:13.420055 543705 net.go:648] Add success.
I0320 20:59:13.422877 543705 net.go:770] primary dev: ETH0
I0320 20:59:13.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:59:13.422910 543705 net.go:698] Add success.
I0320 20:59:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 20:59:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 20:59:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 20:59:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 20:59:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 20:59:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 20:59:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 20:59:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:59:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:59:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 20:59:16.472452 543705 disk_local_worker.go:436] Get disk info: []
E0320 20:59:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:23.409795 543705 memory.go:184] no items to output this cycle
I0320 20:59:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 20:59:26.113673 543705 disk_info.go:125] begin check local disk info of client
I0320 20:59:26.116239 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 20:59:26.116245 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbbc0 0xc0001fbc00]
E0320 20:59:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:33.409805 543705 memory.go:184] no items to output this cycle
I0320 20:59:33.409823 543705 cpu.go:275] no items to output this cycle
E0320 20:59:43.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:43.409838 543705 memory.go:191] Add success.
I0320 20:59:43.409849 543705 cpu.go:282] Add success.
I0320 20:59:43.420076 543705 net.go:648] Add success.
I0320 20:59:43.422857 543705 net.go:770] primary dev: ETH0
I0320 20:59:43.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0320 20:59:43.422884 543705 net.go:698] Add success.
I0320 20:59:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 20:59:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 20:59:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 20:59:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 20:59:53.409800 543705 memory.go:184] no items to output this cycle
I0320 20:59:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 21:00:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:03.409815 543705 memory.go:184] no items to output this cycle
I0320 21:00:03.409828 543705 cpu.go:275] no items to output this cycle
E0320 21:00:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:13.409800 543705 memory.go:191] Add success.
W0320 21:00:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 21:00:13.409837 543705 cpu.go:282] Add success.
W0320 21:00:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:00:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:00:13.420180 543705 net.go:648] Add success.
I0320 21:00:13.422784 543705 net.go:770] primary dev: ETH0
I0320 21:00:13.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:00:13.422814 543705 net.go:698] Add success.
I0320 21:00:13.468699 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cb474ff5-f735-46df-bde9-c9e7b436d36f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:00:13.468743 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:00:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:00:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:00:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 21:00:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:00:14.456660 543705 disk_worker.go:494] system disk:vda1
I0320 21:00:14.456690 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:00:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:00:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:00:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:00:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:00:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:00:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:23.409781 543705 memory.go:184] no items to output this cycle
I0320 21:00:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 21:00:26.117676 543705 disk_info.go:125] begin check local disk info of client
I0320 21:00:26.120125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:00:26.120131 543705 disk_info.go:196] parse disk info done, disk is : [0xc000471d80 0xc000471dc0]
E0320 21:00:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:33.409773 543705 memory.go:184] no items to output this cycle
I0320 21:00:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 21:00:38.620100 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:00:38.620107 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:00:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:43.410563 543705 memory.go:191] Add success.
I0320 21:00:43.409799 543705 cpu.go:282] Add success.
I0320 21:00:43.420280 543705 net.go:648] Add success.
I0320 21:00:43.423377 543705 net.go:770] primary dev: ETH0
I0320 21:00:43.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:00:43.423403 543705 net.go:698] Add success.
I0320 21:00:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:00:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:00:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:00:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:00:53.409785 543705 memory.go:184] no items to output this cycle
I0320 21:00:53.409813 543705 cpu.go:275] no items to output this cycle
I0320 21:01:03.409909 543705 cpu.go:275] no items to output this cycle
E0320 21:01:03.409938 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:03.409973 543705 memory.go:184] no items to output this cycle
E0320 21:01:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:13.409795 543705 memory.go:191] Add success.
I0320 21:01:13.409811 543705 cpu.go:282] Add success.
W0320 21:01:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:01:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:01:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:01:13.420171 543705 net.go:648] Add success.
I0320 21:01:13.422816 543705 net.go:770] primary dev: ETH0
I0320 21:01:13.422841 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:01:13.422854 543705 net.go:698] Add success.
I0320 21:01:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:01:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:01:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 21:01:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:01:14.456490 543705 disk_worker.go:494] system disk:vda1
I0320 21:01:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:01:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:01:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:01:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:01:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:01:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:01:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:23.409798 543705 memory.go:184] no items to output this cycle
I0320 21:01:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 21:01:26.121674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:01:26.124120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:01:26.124127 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5480 0xc0000c54c0]
E0320 21:01:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:33.409765 543705 memory.go:184] no items to output this cycle
I0320 21:01:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 21:01:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:43.409786 543705 memory.go:191] Add success.
I0320 21:01:43.409791 543705 cpu.go:282] Add success.
I0320 21:01:43.419835 543705 net.go:648] Add success.
I0320 21:01:43.422446 543705 net.go:770] primary dev: ETH0
I0320 21:01:43.422459 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:01:43.422471 543705 net.go:698] Add success.
I0320 21:01:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:01:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:01:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:01:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:01:53.409807 543705 memory.go:184] no items to output this cycle
I0320 21:01:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 21:02:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:03.409819 543705 memory.go:184] no items to output this cycle
I0320 21:02:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 21:02:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:13.409801 543705 cpu.go:282] Add success.
I0320 21:02:13.409809 543705 memory.go:191] Add success.
W0320 21:02:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:02:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:02:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:02:13.420091 543705 net.go:648] Add success.
I0320 21:02:13.423185 543705 net.go:770] primary dev: ETH0
I0320 21:02:13.423199 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:02:13.423213 543705 net.go:698] Add success.
W0320 21:02:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:02:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 21:02:14.455206 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:02:14.456995 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:02:14.457004 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:02:14.457010 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:02:14.457042 543705 disk_worker.go:494] system disk:vda1
I0320 21:02:14.457083 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:02:15.456787 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:02:15.456797 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:02:16.457962 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:02:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:02:16.458018 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:02:16.458040 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:02:16.472357 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:02:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:23.409795 543705 memory.go:184] no items to output this cycle
I0320 21:02:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 21:02:26.125672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:02:26.128162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:02:26.128169 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b9d00 0xc0003b9d40]
E0320 21:02:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:33.409796 543705 memory.go:184] no items to output this cycle
I0320 21:02:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 21:02:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:43.409822 543705 memory.go:191] Add success.
I0320 21:02:43.409835 543705 cpu.go:282] Add success.
I0320 21:02:43.419963 543705 net.go:648] Add success.
I0320 21:02:43.422388 543705 net.go:770] primary dev: ETH0
I0320 21:02:43.422403 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:02:43.422418 543705 net.go:698] Add success.
I0320 21:02:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:02:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:02:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:02:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:02:53.409784 543705 memory.go:184] no items to output this cycle
I0320 21:02:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 21:03:03.409871 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:03.409891 543705 memory.go:184] no items to output this cycle
I0320 21:03:03.409919 543705 cpu.go:275] no items to output this cycle
E0320 21:03:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:13.409790 543705 memory.go:191] Add success.
I0320 21:03:13.409805 543705 cpu.go:282] Add success.
W0320 21:03:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:03:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:03:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:03:13.420175 543705 net.go:648] Add success.
I0320 21:03:13.423012 543705 net.go:770] primary dev: ETH0
I0320 21:03:13.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:03:13.423046 543705 net.go:698] Add success.
I0320 21:03:13.469367 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4f38816a-eec6-482c-9c42-1041c81c4208","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:03:13.469399 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:03:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:03:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:03:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0320 21:03:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:03:14.456713 543705 disk_worker.go:494] system disk:vda1
I0320 21:03:14.456743 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:03:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:03:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:03:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:03:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:03:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:03:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:23.409777 543705 cpu.go:275] no items to output this cycle
I0320 21:03:23.409786 543705 memory.go:184] no items to output this cycle
I0320 21:03:26.129672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:03:26.132232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:03:26.132238 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492600 0xc000492640]
E0320 21:03:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:33.409775 543705 memory.go:184] no items to output this cycle
I0320 21:03:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 21:03:38.620247 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:03:38.620254 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:03:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:43.410659 543705 memory.go:191] Add success.
I0320 21:03:43.409805 543705 cpu.go:282] Add success.
I0320 21:03:43.420422 543705 net.go:648] Add success.
I0320 21:03:43.423594 543705 net.go:770] primary dev: ETH0
I0320 21:03:43.423605 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:03:43.423618 543705 net.go:698] Add success.
I0320 21:03:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:03:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:03:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:03:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:03:53.409787 543705 memory.go:184] no items to output this cycle
I0320 21:03:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 21:04:03.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:03.409882 543705 memory.go:184] no items to output this cycle
I0320 21:04:03.409956 543705 cpu.go:275] no items to output this cycle
E0320 21:04:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:13.409795 543705 memory.go:191] Add success.
I0320 21:04:13.409805 543705 cpu.go:282] Add success.
W0320 21:04:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:04:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:04:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:04:13.420242 543705 net.go:648] Add success.
I0320 21:04:13.422911 543705 net.go:770] primary dev: ETH0
I0320 21:04:13.422924 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:04:13.422936 543705 net.go:698] Add success.
I0320 21:04:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:04:14.455108 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:04:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 21:04:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:04:14.456517 543705 disk_worker.go:494] system disk:vda1
I0320 21:04:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:04:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:04:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:04:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:04:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:04:16.472453 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:04:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:23.409790 543705 memory.go:184] no items to output this cycle
I0320 21:04:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 21:04:26.133677 543705 disk_info.go:125] begin check local disk info of client
I0320 21:04:26.136192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:04:26.136197 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b1080 0xc0004b10c0]
E0320 21:04:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:33.409775 543705 memory.go:184] no items to output this cycle
I0320 21:04:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:04:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:43.409819 543705 memory.go:191] Add success.
I0320 21:04:43.409830 543705 cpu.go:282] Add success.
I0320 21:04:43.419956 543705 net.go:648] Add success.
I0320 21:04:43.422585 543705 net.go:770] primary dev: ETH0
I0320 21:04:43.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:04:43.422611 543705 net.go:698] Add success.
I0320 21:04:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:04:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:04:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:04:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:04:53.409869 543705 cpu.go:275] no items to output this cycle
I0320 21:04:53.409888 543705 memory.go:184] no items to output this cycle
E0320 21:05:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:03.409810 543705 memory.go:184] no items to output this cycle
I0320 21:05:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 21:05:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:13.409790 543705 memory.go:191] Add success.
I0320 21:05:13.409793 543705 cpu.go:282] Add success.
W0320 21:05:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:05:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:05:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:05:13.420167 543705 net.go:648] Add success.
I0320 21:05:13.423336 543705 net.go:770] primary dev: ETH0
I0320 21:05:13.423349 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:05:13.423363 543705 net.go:698] Add success.
I0320 21:05:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:05:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:05:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 21:05:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:05:14.456521 543705 disk_worker.go:494] system disk:vda1
I0320 21:05:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:05:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:05:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:05:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:05:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:05:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:05:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:23.409769 543705 memory.go:184] no items to output this cycle
I0320 21:05:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 21:05:26.137675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:05:26.140099 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:05:26.140105 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032b8c0 0xc00032b900]
E0320 21:05:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:33.409793 543705 memory.go:184] no items to output this cycle
I0320 21:05:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 21:05:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:43.409790 543705 memory.go:191] Add success.
I0320 21:05:43.409793 543705 cpu.go:282] Add success.
I0320 21:05:43.419930 543705 net.go:648] Add success.
I0320 21:05:43.423144 543705 net.go:770] primary dev: ETH0
I0320 21:05:43.423156 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:05:43.423168 543705 net.go:698] Add success.
I0320 21:05:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:05:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:05:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:05:53.410334 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:05:53.410351 543705 memory.go:184] no items to output this cycle
I0320 21:05:53.410362 543705 cpu.go:275] no items to output this cycle
E0320 21:06:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:03.409801 543705 memory.go:184] no items to output this cycle
I0320 21:06:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:06:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:13.409792 543705 memory.go:191] Add success.
W0320 21:06:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:06:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:06:13.409831 543705 cpu.go:282] Add success.
I0320 21:06:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:06:13.420577 543705 net.go:648] Add success.
I0320 21:06:13.423476 543705 net.go:770] primary dev: ETH0
I0320 21:06:13.423494 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:06:13.423510 543705 net.go:698] Add success.
I0320 21:06:13.468388 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"28b60117-f0e8-4a33-a028-acbf568387a4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:06:13.468423 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:06:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:06:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:06:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 21:06:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:06:14.456631 543705 disk_worker.go:494] system disk:vda1
I0320 21:06:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:06:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:06:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:06:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:06:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:06:16.472364 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:06:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:23.409779 543705 memory.go:184] no items to output this cycle
I0320 21:06:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 21:06:26.141671 543705 disk_info.go:125] begin check local disk info of client
I0320 21:06:26.144096 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:06:26.144101 543705 disk_info.go:196] parse disk info done, disk is : [0xc000473380 0xc0004733c0]
E0320 21:06:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:33.409779 543705 memory.go:184] no items to output this cycle
I0320 21:06:33.409784 543705 cpu.go:275] no items to output this cycle
I0320 21:06:38.620394 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:06:38.620408 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:06:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:43.410583 543705 memory.go:191] Add success.
I0320 21:06:43.409806 543705 cpu.go:282] Add success.
I0320 21:06:43.420315 543705 net.go:648] Add success.
I0320 21:06:43.422950 543705 net.go:770] primary dev: ETH0
I0320 21:06:43.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:06:43.422977 543705 net.go:698] Add success.
I0320 21:06:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:06:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:06:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:06:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:06:53.409800 543705 memory.go:184] no items to output this cycle
I0320 21:06:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 21:07:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:03.409794 543705 memory.go:184] no items to output this cycle
I0320 21:07:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 21:07:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:13.409780 543705 memory.go:191] Add success.
W0320 21:07:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:07:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:07:13.409819 543705 cpu.go:282] Add success.
I0320 21:07:13.409821 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:07:13.420125 543705 net.go:648] Add success.
I0320 21:07:13.422845 543705 net.go:770] primary dev: ETH0
I0320 21:07:13.422858 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:07:13.422870 543705 net.go:698] Add success.
I0320 21:07:13.453404 543705 event_worker.go:152] Polling the log file for events...
W0320 21:07:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:07:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 21:07:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:07:14.455878 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:07:14.455887 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:07:14.455892 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:07:14.456555 543705 disk_worker.go:494] system disk:vda1
I0320 21:07:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:07:15.456836 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:07:15.456845 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:07:16.457906 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:07:16.457906 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:07:16.457960 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:07:16.457979 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:07:16.472360 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:07:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:23.409795 543705 memory.go:184] no items to output this cycle
I0320 21:07:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 21:07:26.145674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:07:26.148124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:07:26.148130 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a9280 0xc0002a92c0]
E0320 21:07:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:33.409796 543705 memory.go:184] no items to output this cycle
I0320 21:07:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 21:07:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:43.409801 543705 memory.go:191] Add success.
I0320 21:07:43.409801 543705 cpu.go:282] Add success.
I0320 21:07:43.419848 543705 net.go:648] Add success.
I0320 21:07:43.422521 543705 net.go:770] primary dev: ETH0
I0320 21:07:43.422534 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:07:43.422547 543705 net.go:698] Add success.
I0320 21:07:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:07:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:07:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:07:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:07:53.409770 543705 memory.go:184] no items to output this cycle
I0320 21:07:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 21:08:03.409878 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:03.409898 543705 memory.go:184] no items to output this cycle
I0320 21:08:03.409977 543705 cpu.go:275] no items to output this cycle
E0320 21:08:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:13.409832 543705 memory.go:191] Add success.
I0320 21:08:13.409846 543705 cpu.go:282] Add success.
W0320 21:08:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:08:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:08:13.409886 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:08:13.420156 543705 net.go:648] Add success.
I0320 21:08:13.422887 543705 net.go:770] primary dev: ETH0
I0320 21:08:13.422902 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:08:13.422916 543705 net.go:698] Add success.
I0320 21:08:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:08:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:08:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 21:08:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:08:14.456597 543705 disk_worker.go:494] system disk:vda1
I0320 21:08:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:08:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:08:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:08:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:08:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:08:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:08:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:23.409775 543705 memory.go:184] no items to output this cycle
I0320 21:08:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 21:08:26.149673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:08:26.152107 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:08:26.152113 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abac0 0xc0001abb00]
E0320 21:08:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:33.409768 543705 memory.go:184] no items to output this cycle
I0320 21:08:33.409787 543705 cpu.go:275] no items to output this cycle
E0320 21:08:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:43.409818 543705 memory.go:191] Add success.
I0320 21:08:43.409821 543705 cpu.go:282] Add success.
I0320 21:08:43.419939 543705 net.go:648] Add success.
I0320 21:08:43.422771 543705 net.go:770] primary dev: ETH0
I0320 21:08:43.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:08:43.422799 543705 net.go:698] Add success.
I0320 21:08:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:08:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:08:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:08:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:08:53.409790 543705 memory.go:184] no items to output this cycle
I0320 21:08:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 21:09:03.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:03.409884 543705 memory.go:184] no items to output this cycle
I0320 21:09:03.409958 543705 cpu.go:275] no items to output this cycle
E0320 21:09:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:13.409787 543705 memory.go:191] Add success.
W0320 21:09:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 21:09:13.409814 543705 cpu.go:282] Add success.
W0320 21:09:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:09:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:09:13.420081 543705 net.go:648] Add success.
I0320 21:09:13.422619 543705 net.go:770] primary dev: ETH0
I0320 21:09:13.422632 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:09:13.422644 543705 net.go:698] Add success.
I0320 21:09:13.468635 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3f1a29c2-eae4-4179-804a-67f9f6ceac86","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:09:13.468680 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:09:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:09:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:09:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 21:09:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:09:14.456536 543705 disk_worker.go:494] system disk:vda1
I0320 21:09:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:09:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:09:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:09:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:09:16.458044 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:09:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:09:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:23.409780 543705 memory.go:184] no items to output this cycle
I0320 21:09:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 21:09:26.153675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:09:26.156137 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:09:26.156143 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aad00 0xc0001aad40]
E0320 21:09:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:33.409791 543705 memory.go:184] no items to output this cycle
I0320 21:09:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 21:09:38.620804 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:09:38.620811 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:09:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:43.410623 543705 memory.go:191] Add success.
I0320 21:09:43.409806 543705 cpu.go:282] Add success.
I0320 21:09:43.420334 543705 net.go:648] Add success.
I0320 21:09:43.422778 543705 net.go:770] primary dev: ETH0
I0320 21:09:43.422793 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:09:43.422806 543705 net.go:698] Add success.
I0320 21:09:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:09:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:09:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:09:53.410377 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:09:53.410395 543705 memory.go:184] no items to output this cycle
I0320 21:09:53.410408 543705 cpu.go:275] no items to output this cycle
E0320 21:10:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:03.409795 543705 memory.go:184] no items to output this cycle
I0320 21:10:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 21:10:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:13.409808 543705 memory.go:191] Add success.
I0320 21:10:13.409818 543705 cpu.go:282] Add success.
W0320 21:10:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:10:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:10:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:10:13.420247 543705 net.go:648] Add success.
I0320 21:10:13.422904 543705 net.go:770] primary dev: ETH0
I0320 21:10:13.422922 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:10:13.422937 543705 net.go:698] Add success.
I0320 21:10:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:10:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:10:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 21:10:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:10:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 21:10:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:10:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:10:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:10:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:10:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:10:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:10:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:23.409775 543705 memory.go:184] no items to output this cycle
I0320 21:10:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 21:10:26.157669 543705 disk_info.go:125] begin check local disk info of client
I0320 21:10:26.160118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:10:26.160124 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab740 0xc0001ab780]
E0320 21:10:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:33.409784 543705 memory.go:184] no items to output this cycle
I0320 21:10:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 21:10:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:43.409799 543705 cpu.go:282] Add success.
I0320 21:10:43.409803 543705 memory.go:191] Add success.
I0320 21:10:43.420001 543705 net.go:648] Add success.
I0320 21:10:43.423255 543705 net.go:770] primary dev: ETH0
I0320 21:10:43.423267 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:10:43.423280 543705 net.go:698] Add success.
I0320 21:10:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:10:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:10:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:10:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:10:53.409867 543705 memory.go:184] no items to output this cycle
I0320 21:10:53.409924 543705 cpu.go:275] no items to output this cycle
E0320 21:11:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:03.409811 543705 memory.go:184] no items to output this cycle
I0320 21:11:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 21:11:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:13.409787 543705 memory.go:191] Add success.
I0320 21:11:13.409804 543705 cpu.go:282] Add success.
W0320 21:11:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:11:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:11:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:11:13.420195 543705 net.go:648] Add success.
I0320 21:11:13.423310 543705 net.go:770] primary dev: ETH0
I0320 21:11:13.423325 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:11:13.423339 543705 net.go:698] Add success.
I0320 21:11:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:11:14.455194 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:11:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 21:11:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:11:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 21:11:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:11:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:11:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:11:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:11:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:11:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:11:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:23.409776 543705 memory.go:184] no items to output this cycle
I0320 21:11:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 21:11:26.161671 543705 disk_info.go:125] begin check local disk info of client
I0320 21:11:26.164188 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:11:26.164194 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ee1c0 0xc0003ee200]
E0320 21:11:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:33.409795 543705 memory.go:184] no items to output this cycle
I0320 21:11:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:11:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:43.409788 543705 memory.go:191] Add success.
I0320 21:11:43.409803 543705 cpu.go:282] Add success.
I0320 21:11:43.420193 543705 net.go:648] Add success.
I0320 21:11:43.423272 543705 net.go:770] primary dev: ETH0
I0320 21:11:43.423285 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:11:43.423298 543705 net.go:698] Add success.
I0320 21:11:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:11:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:11:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:11:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:11:53.409765 543705 memory.go:184] no items to output this cycle
I0320 21:11:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 21:12:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:03.409819 543705 memory.go:184] no items to output this cycle
I0320 21:12:03.409832 543705 cpu.go:275] no items to output this cycle
E0320 21:12:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:13.409801 543705 memory.go:191] Add success.
I0320 21:12:13.409808 543705 cpu.go:282] Add success.
W0320 21:12:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:12:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:12:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:12:13.420072 543705 net.go:648] Add success.
I0320 21:12:13.422957 543705 net.go:770] primary dev: ETH0
I0320 21:12:13.422970 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:12:13.422983 543705 net.go:698] Add success.
I0320 21:12:13.470268 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4cf2026e-d948-4a07-928b-67f22d8154c7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:12:13.470300 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 21:12:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:12:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 21:12:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:12:14.455965 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:12:14.455974 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:12:14.455980 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:12:14.456793 543705 disk_worker.go:494] system disk:vda1
I0320 21:12:14.456825 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:12:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:12:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:12:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:12:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:12:16.458001 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:12:16.458021 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:12:16.472343 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:12:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:23.409765 543705 memory.go:184] no items to output this cycle
I0320 21:12:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 21:12:26.165672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:12:26.168067 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:12:26.168072 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faa80 0xc0001faac0]
E0320 21:12:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:33.409760 543705 memory.go:184] no items to output this cycle
I0320 21:12:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 21:12:38.621733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:12:38.621740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:12:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:43.410624 543705 memory.go:191] Add success.
I0320 21:12:43.409799 543705 cpu.go:282] Add success.
I0320 21:12:43.420377 543705 net.go:648] Add success.
I0320 21:12:43.423293 543705 net.go:770] primary dev: ETH0
I0320 21:12:43.423305 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:12:43.423316 543705 net.go:698] Add success.
I0320 21:12:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:12:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:12:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:12:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:12:53.409796 543705 memory.go:184] no items to output this cycle
I0320 21:12:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 21:13:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:03.409775 543705 memory.go:184] no items to output this cycle
I0320 21:13:03.409796 543705 cpu.go:275] no items to output this cycle
I0320 21:13:13.409980 543705 cpu.go:282] Add success.
E0320 21:13:13.410016 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:13.410040 543705 memory.go:191] Add success.
W0320 21:13:13.410069 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:13:13.410240 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:13:13.410246 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:13:13.419746 543705 net.go:648] Add success.
I0320 21:13:13.422639 543705 net.go:770] primary dev: ETH0
I0320 21:13:13.422652 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:13:13.422663 543705 net.go:698] Add success.
I0320 21:13:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:13:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:13:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0320 21:13:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:13:14.456489 543705 disk_worker.go:494] system disk:vda1
I0320 21:13:14.456545 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:13:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:13:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:13:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:13:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:13:16.472427 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:13:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:23.409773 543705 memory.go:184] no items to output this cycle
I0320 21:13:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 21:13:26.169675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:13:26.172119 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:13:26.172125 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001abec0 0xc0001abf00]
E0320 21:13:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:33.409765 543705 memory.go:184] no items to output this cycle
I0320 21:13:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 21:13:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:43.409815 543705 memory.go:191] Add success.
I0320 21:13:43.409823 543705 cpu.go:282] Add success.
I0320 21:13:43.419849 543705 net.go:648] Add success.
I0320 21:13:43.422523 543705 net.go:770] primary dev: ETH0
I0320 21:13:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:13:43.422548 543705 net.go:698] Add success.
I0320 21:13:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:13:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:13:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:13:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:13:53.409804 543705 memory.go:184] no items to output this cycle
I0320 21:13:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 21:14:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:03.409785 543705 memory.go:184] no items to output this cycle
I0320 21:14:03.409792 543705 cpu.go:275] no items to output this cycle
W0320 21:14:13.409709 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:14:13.409726 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:14:13.409730 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:14:13.409851 543705 cpu.go:282] Add success.
E0320 21:14:13.409958 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:13.409996 543705 memory.go:191] Add success.
I0320 21:14:13.419732 543705 net.go:648] Add success.
I0320 21:14:13.422348 543705 net.go:770] primary dev: ETH0
I0320 21:14:13.422360 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:14:13.422372 543705 net.go:698] Add success.
I0320 21:14:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:14:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:14:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 21:14:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:14:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 21:14:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:14:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:14:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:14:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:14:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:14:16.472439 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:14:23.410742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:23.410757 543705 memory.go:184] no items to output this cycle
I0320 21:14:23.410760 543705 cpu.go:275] no items to output this cycle
I0320 21:14:26.173682 543705 disk_info.go:125] begin check local disk info of client
I0320 21:14:26.176202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:14:26.176208 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab600 0xc0001ab640]
E0320 21:14:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:33.409801 543705 memory.go:184] no items to output this cycle
I0320 21:14:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 21:14:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:43.409803 543705 memory.go:191] Add success.
I0320 21:14:43.409823 543705 cpu.go:282] Add success.
I0320 21:14:43.419886 543705 net.go:648] Add success.
I0320 21:14:43.422705 543705 net.go:770] primary dev: ETH0
I0320 21:14:43.422719 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:14:43.422736 543705 net.go:698] Add success.
I0320 21:14:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:14:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:14:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:14:53.410343 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:14:53.410358 543705 memory.go:184] no items to output this cycle
I0320 21:14:53.410379 543705 cpu.go:275] no items to output this cycle
E0320 21:15:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:03.409786 543705 memory.go:184] no items to output this cycle
I0320 21:15:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 21:15:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:13.409803 543705 memory.go:191] Add success.
I0320 21:15:13.409809 543705 cpu.go:282] Add success.
W0320 21:15:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:15:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:15:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:15:13.420157 543705 net.go:648] Add success.
I0320 21:15:13.423520 543705 net.go:770] primary dev: ETH0
I0320 21:15:13.423536 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:15:13.423549 543705 net.go:698] Add success.
I0320 21:15:13.463566 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"82664d1a-b401-49f1-b518-4619e8582181","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:15:13.463599 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:15:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:15:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:15:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 21:15:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:15:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 21:15:14.456610 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:15:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:15:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:15:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:15:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:15:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:15:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:23.409783 543705 memory.go:184] no items to output this cycle
I0320 21:15:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 21:15:26.177676 543705 disk_info.go:125] begin check local disk info of client
I0320 21:15:26.180144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:15:26.180152 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a8b80 0xc0002a8bc0]
E0320 21:15:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:33.409768 543705 memory.go:184] no items to output this cycle
I0320 21:15:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 21:15:38.622820 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:15:38.622827 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:15:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:43.410716 543705 memory.go:191] Add success.
I0320 21:15:43.409803 543705 cpu.go:282] Add success.
I0320 21:15:43.420498 543705 net.go:648] Add success.
I0320 21:15:43.423394 543705 net.go:770] primary dev: ETH0
I0320 21:15:43.423408 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:15:43.423422 543705 net.go:698] Add success.
I0320 21:15:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:15:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:15:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:15:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:15:53.409772 543705 memory.go:184] no items to output this cycle
I0320 21:15:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 21:16:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:03.409814 543705 memory.go:184] no items to output this cycle
I0320 21:16:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 21:16:13.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:13.409782 543705 memory.go:191] Add success.
W0320 21:16:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 21:16:13.409811 543705 cpu.go:282] Add success.
W0320 21:16:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:16:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:16:13.420194 543705 net.go:648] Add success.
I0320 21:16:13.423066 543705 net.go:770] primary dev: ETH0
I0320 21:16:13.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:16:13.423091 543705 net.go:698] Add success.
I0320 21:16:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:16:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:16:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 21:16:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:16:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 21:16:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:16:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:16:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:16:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:16:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:16:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:16:23.410475 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:23.410492 543705 memory.go:184] no items to output this cycle
I0320 21:16:23.410511 543705 cpu.go:275] no items to output this cycle
I0320 21:16:26.181673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:16:26.184114 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:16:26.184119 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a9240 0xc0002a9280]
E0320 21:16:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:33.409780 543705 cpu.go:275] no items to output this cycle
I0320 21:16:33.409788 543705 memory.go:184] no items to output this cycle
E0320 21:16:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:43.409819 543705 memory.go:191] Add success.
I0320 21:16:43.409833 543705 cpu.go:282] Add success.
I0320 21:16:43.420006 543705 net.go:648] Add success.
I0320 21:16:43.422691 543705 net.go:770] primary dev: ETH0
I0320 21:16:43.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:16:43.422720 543705 net.go:698] Add success.
I0320 21:16:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:16:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:16:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:16:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:16:53.409773 543705 memory.go:184] no items to output this cycle
I0320 21:16:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 21:17:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:03.409777 543705 memory.go:184] no items to output this cycle
I0320 21:17:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 21:17:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:13.409813 543705 memory.go:191] Add success.
I0320 21:17:13.409819 543705 cpu.go:282] Add success.
W0320 21:17:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:17:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:17:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:17:13.420057 543705 net.go:648] Add success.
I0320 21:17:13.422711 543705 net.go:770] primary dev: ETH0
I0320 21:17:13.422725 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:17:13.422737 543705 net.go:698] Add success.
I0320 21:17:13.452865 543705 event_worker.go:152] Polling the log file for events...
W0320 21:17:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:17:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 21:17:14.455195 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:17:14.455911 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:17:14.455920 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:17:14.455926 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:17:14.456552 543705 disk_worker.go:494] system disk:vda1
I0320 21:17:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:17:15.456831 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:17:15.456840 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:17:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:17:16.457933 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:17:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:17:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:17:16.472350 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:17:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:23.409792 543705 memory.go:184] no items to output this cycle
I0320 21:17:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 21:17:26.185672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:17:26.188177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:17:26.188183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5b80 0xc0000c5bc0]
E0320 21:17:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:33.409805 543705 memory.go:184] no items to output this cycle
I0320 21:17:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 21:17:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:43.409778 543705 memory.go:191] Add success.
I0320 21:17:43.409806 543705 cpu.go:282] Add success.
I0320 21:17:43.419864 543705 net.go:648] Add success.
I0320 21:17:43.423228 543705 net.go:770] primary dev: ETH0
I0320 21:17:43.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:17:43.423255 543705 net.go:698] Add success.
I0320 21:17:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:17:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:17:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:17:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:17:53.409773 543705 memory.go:184] no items to output this cycle
I0320 21:17:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 21:18:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:03.409782 543705 memory.go:184] no items to output this cycle
I0320 21:18:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 21:18:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:13.409791 543705 memory.go:191] Add success.
I0320 21:18:13.409801 543705 cpu.go:282] Add success.
W0320 21:18:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:18:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:18:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:18:13.420049 543705 net.go:648] Add success.
I0320 21:18:13.423256 543705 net.go:770] primary dev: ETH0
I0320 21:18:13.423270 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:18:13.423295 543705 net.go:698] Add success.
I0320 21:18:13.481709 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2eec28f2-0260-455e-a9a5-ee13c6970c9a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:18:13.481751 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:18:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:18:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 21:18:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:18:14.456729 543705 disk_worker.go:494] system disk:vda1
I0320 21:18:14.456834 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:18:15.455613 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:18:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:18:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:18:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:18:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:18:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:23.409800 543705 memory.go:184] no items to output this cycle
I0320 21:18:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 21:18:26.189669 543705 disk_info.go:125] begin check local disk info of client
I0320 21:18:26.192169 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:18:26.192176 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a8080 0xc0002a80c0]
E0320 21:18:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:33.409778 543705 memory.go:184] no items to output this cycle
I0320 21:18:33.409798 543705 cpu.go:275] no items to output this cycle
I0320 21:18:38.622970 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:18:38.622977 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:18:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:43.410734 543705 memory.go:191] Add success.
I0320 21:18:43.409820 543705 cpu.go:282] Add success.
I0320 21:18:43.420430 543705 net.go:648] Add success.
I0320 21:18:43.423019 543705 net.go:770] primary dev: ETH0
I0320 21:18:43.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:18:43.423046 543705 net.go:698] Add success.
I0320 21:18:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:18:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:18:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:18:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:18:53.409783 543705 memory.go:184] no items to output this cycle
I0320 21:18:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 21:19:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:03.409777 543705 memory.go:184] no items to output this cycle
I0320 21:19:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 21:19:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:13.409793 543705 memory.go:191] Add success.
I0320 21:19:13.409811 543705 cpu.go:282] Add success.
W0320 21:19:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:19:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:19:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:19:13.420140 543705 net.go:648] Add success.
I0320 21:19:13.422878 543705 net.go:770] primary dev: ETH0
I0320 21:19:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:19:13.422904 543705 net.go:698] Add success.
I0320 21:19:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:19:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:19:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 21:19:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:19:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 21:19:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:19:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:19:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:19:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:19:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:19:16.472441 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:19:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:23.409791 543705 memory.go:184] no items to output this cycle
I0320 21:19:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 21:19:26.193675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:19:26.196149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:19:26.196155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a84c0 0xc0002a8500]
E0320 21:19:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:33.409769 543705 memory.go:184] no items to output this cycle
I0320 21:19:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:19:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:43.409794 543705 memory.go:191] Add success.
I0320 21:19:43.409806 543705 cpu.go:282] Add success.
I0320 21:19:43.419980 543705 net.go:648] Add success.
I0320 21:19:43.422898 543705 net.go:770] primary dev: ETH0
I0320 21:19:43.422912 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:19:43.422925 543705 net.go:698] Add success.
I0320 21:19:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:19:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:19:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:19:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:19:53.409804 543705 memory.go:184] no items to output this cycle
I0320 21:19:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 21:20:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:03.409784 543705 memory.go:184] no items to output this cycle
I0320 21:20:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 21:20:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:13.409789 543705 memory.go:191] Add success.
I0320 21:20:13.409813 543705 cpu.go:282] Add success.
W0320 21:20:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:20:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:20:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:20:13.420178 543705 net.go:648] Add success.
I0320 21:20:13.422756 543705 net.go:770] primary dev: ETH0
I0320 21:20:13.422771 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:20:13.422784 543705 net.go:698] Add success.
I0320 21:20:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:20:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:20:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 21:20:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:20:14.456581 543705 disk_worker.go:494] system disk:vda1
I0320 21:20:14.456612 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:20:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:20:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:20:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:20:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:20:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:20:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:23.409763 543705 memory.go:184] no items to output this cycle
I0320 21:20:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 21:20:26.197678 543705 disk_info.go:125] begin check local disk info of client
I0320 21:20:26.200128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:20:26.200135 543705 disk_info.go:196] parse disk info done, disk is : [0xc00058c7c0 0xc00058c800]
E0320 21:20:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:33.409817 543705 memory.go:184] no items to output this cycle
I0320 21:20:33.409827 543705 cpu.go:275] no items to output this cycle
E0320 21:20:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:43.409794 543705 memory.go:191] Add success.
I0320 21:20:43.409808 543705 cpu.go:282] Add success.
I0320 21:20:43.419910 543705 net.go:648] Add success.
I0320 21:20:43.422665 543705 net.go:770] primary dev: ETH0
I0320 21:20:43.422681 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:20:43.422695 543705 net.go:698] Add success.
I0320 21:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:20:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:20:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:20:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:20:53.409784 543705 memory.go:184] no items to output this cycle
I0320 21:20:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 21:21:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:03.409814 543705 memory.go:184] no items to output this cycle
I0320 21:21:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 21:21:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:13.409795 543705 cpu.go:282] Add success.
I0320 21:21:13.409800 543705 memory.go:191] Add success.
W0320 21:21:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:21:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:21:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:21:13.419704 543705 net.go:648] Add success.
I0320 21:21:13.422145 543705 net.go:770] primary dev: ETH0
I0320 21:21:13.422158 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:21:13.422174 543705 net.go:698] Add success.
I0320 21:21:13.469147 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6802158b-db72-4963-9b10-aeabb65fdef3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:21:13.469179 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:21:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:21:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:21:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0320 21:21:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:21:14.456543 543705 disk_worker.go:494] system disk:vda1
I0320 21:21:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:21:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:21:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:21:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:21:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:21:16.472516 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:21:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:23.409784 543705 memory.go:184] no items to output this cycle
I0320 21:21:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 21:21:26.201679 543705 disk_info.go:125] begin check local disk info of client
I0320 21:21:26.204126 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:21:26.204133 543705 disk_info.go:196] parse disk info done, disk is : [0xc000280000 0xc000280040]
E0320 21:21:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 21:21:33.409794 543705 memory.go:184] no items to output this cycle
I0320 21:21:38.623824 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:21:38.623830 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:21:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:43.410700 543705 memory.go:191] Add success.
I0320 21:21:43.409817 543705 cpu.go:282] Add success.
I0320 21:21:43.420395 543705 net.go:648] Add success.
I0320 21:21:43.423202 543705 net.go:770] primary dev: ETH0
I0320 21:21:43.423215 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:21:43.423227 543705 net.go:698] Add success.
I0320 21:21:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:21:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:21:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:21:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:21:53.409773 543705 memory.go:184] no items to output this cycle
I0320 21:21:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 21:22:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:03.409784 543705 memory.go:184] no items to output this cycle
I0320 21:22:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 21:22:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:13.409789 543705 memory.go:191] Add success.
I0320 21:22:13.409808 543705 cpu.go:282] Add success.
W0320 21:22:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:22:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:22:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:22:13.420120 543705 net.go:648] Add success.
I0320 21:22:13.422753 543705 net.go:770] primary dev: ETH0
I0320 21:22:13.422766 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:22:13.422778 543705 net.go:698] Add success.
W0320 21:22:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:22:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 21:22:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:22:14.456912 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:22:14.456921 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:22:14.456927 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:22:14.456999 543705 disk_worker.go:494] system disk:vda1
I0320 21:22:14.457045 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:22:15.456850 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:22:15.456859 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:22:16.458003 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:22:16.458011 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:22:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:22:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:22:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:22:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:23.409779 543705 memory.go:184] no items to output this cycle
I0320 21:22:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 21:22:26.205674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:22:26.208323 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:22:26.208334 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bc000 0xc0004bc040]
E0320 21:22:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:33.409790 543705 memory.go:184] no items to output this cycle
I0320 21:22:33.409789 543705 cpu.go:275] no items to output this cycle
E0320 21:22:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:43.409804 543705 memory.go:191] Add success.
I0320 21:22:43.409807 543705 cpu.go:282] Add success.
I0320 21:22:43.419891 543705 net.go:648] Add success.
I0320 21:22:43.422705 543705 net.go:770] primary dev: ETH0
I0320 21:22:43.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:22:43.422736 543705 net.go:698] Add success.
I0320 21:22:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:22:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:22:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:22:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:22:53.409799 543705 memory.go:184] no items to output this cycle
I0320 21:22:53.409847 543705 cpu.go:275] no items to output this cycle
E0320 21:23:03.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:03.409804 543705 memory.go:184] no items to output this cycle
I0320 21:23:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 21:23:13.409815 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:13.409851 543705 memory.go:191] Add success.
I0320 21:23:13.409856 543705 cpu.go:282] Add success.
W0320 21:23:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:23:13.409901 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:23:13.409905 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:23:13.420203 543705 net.go:648] Add success.
I0320 21:23:13.423026 543705 net.go:770] primary dev: ETH0
I0320 21:23:13.423039 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:23:13.423051 543705 net.go:698] Add success.
I0320 21:23:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:23:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:23:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0320 21:23:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:23:14.456600 543705 disk_worker.go:494] system disk:vda1
I0320 21:23:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:23:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:23:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:23:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:23:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:23:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:23:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:23.409806 543705 memory.go:184] no items to output this cycle
I0320 21:23:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 21:23:26.209671 543705 disk_info.go:125] begin check local disk info of client
I0320 21:23:26.212151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:23:26.212157 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e8600 0xc0003e8640]
E0320 21:23:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:33.409784 543705 memory.go:184] no items to output this cycle
I0320 21:23:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:23:43.409872 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:43.409923 543705 memory.go:191] Add success.
I0320 21:23:43.410074 543705 cpu.go:282] Add success.
I0320 21:23:43.419708 543705 net.go:648] Add success.
I0320 21:23:43.422490 543705 net.go:770] primary dev: ETH0
I0320 21:23:43.422502 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:23:43.422514 543705 net.go:698] Add success.
I0320 21:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:23:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:23:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:23:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:23:53.409790 543705 memory.go:184] no items to output this cycle
I0320 21:23:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 21:24:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:03.409785 543705 memory.go:184] no items to output this cycle
I0320 21:24:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 21:24:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:13.409793 543705 memory.go:191] Add success.
I0320 21:24:13.409814 543705 cpu.go:282] Add success.
W0320 21:24:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:24:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:24:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:24:13.420152 543705 net.go:648] Add success.
I0320 21:24:13.422832 543705 net.go:770] primary dev: ETH0
I0320 21:24:13.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:24:13.422861 543705 net.go:698] Add success.
I0320 21:24:13.463361 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4095ec15-91c5-4e53-a092-c36baada3007","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:24:13.463394 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:24:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:24:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:24:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 21:24:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:24:14.456702 543705 disk_worker.go:494] system disk:vda1
I0320 21:24:14.456737 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:24:15.455614 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:24:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:24:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:24:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:24:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:24:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:23.409775 543705 memory.go:184] no items to output this cycle
I0320 21:24:23.409794 543705 cpu.go:275] no items to output this cycle
I0320 21:24:26.213685 543705 disk_info.go:125] begin check local disk info of client
I0320 21:24:26.215994 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:24:26.216001 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034d040 0xc00034d080]
E0320 21:24:33.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:33.409820 543705 memory.go:184] no items to output this cycle
I0320 21:24:33.409828 543705 cpu.go:275] no items to output this cycle
I0320 21:24:38.624824 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:24:38.624831 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:24:43.409842 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:43.410543 543705 memory.go:191] Add success.
I0320 21:24:43.409927 543705 cpu.go:282] Add success.
I0320 21:24:43.419755 543705 net.go:648] Add success.
I0320 21:24:43.422355 543705 net.go:770] primary dev: ETH0
I0320 21:24:43.422369 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:24:43.422383 543705 net.go:698] Add success.
I0320 21:24:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:24:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:24:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:24:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:24:53.409777 543705 memory.go:184] no items to output this cycle
I0320 21:24:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 21:25:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:03.409777 543705 memory.go:184] no items to output this cycle
I0320 21:25:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 21:25:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:13.409791 543705 memory.go:191] Add success.
I0320 21:25:13.409811 543705 cpu.go:282] Add success.
W0320 21:25:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:25:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:25:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:25:13.420182 543705 net.go:648] Add success.
I0320 21:25:13.422942 543705 net.go:770] primary dev: ETH0
I0320 21:25:13.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:25:13.422969 543705 net.go:698] Add success.
I0320 21:25:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:25:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:25:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 21:25:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:25:14.456501 543705 disk_worker.go:494] system disk:vda1
I0320 21:25:14.456543 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:25:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:25:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:25:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:25:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:25:16.472416 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:25:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:23.409765 543705 memory.go:184] no items to output this cycle
I0320 21:25:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 21:25:26.217678 543705 disk_info.go:125] begin check local disk info of client
I0320 21:25:26.220118 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:25:26.220124 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037cb00 0xc00037cb40]
E0320 21:25:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:33.409801 543705 memory.go:184] no items to output this cycle
I0320 21:25:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 21:25:43.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:43.409920 543705 memory.go:191] Add success.
I0320 21:25:43.409936 543705 cpu.go:282] Add success.
I0320 21:25:43.419754 543705 net.go:648] Add success.
I0320 21:25:43.422269 543705 net.go:770] primary dev: ETH0
I0320 21:25:43.422284 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:25:43.422297 543705 net.go:698] Add success.
I0320 21:25:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:25:46.458069 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:25:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:25:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:25:53.409782 543705 memory.go:184] no items to output this cycle
I0320 21:25:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 21:26:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:03.409781 543705 memory.go:184] no items to output this cycle
I0320 21:26:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 21:26:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:13.409799 543705 memory.go:191] Add success.
I0320 21:26:13.409821 543705 cpu.go:282] Add success.
W0320 21:26:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:26:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:26:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:26:13.420208 543705 net.go:648] Add success.
I0320 21:26:13.423142 543705 net.go:770] primary dev: ETH0
I0320 21:26:13.423157 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:26:13.423171 543705 net.go:698] Add success.
I0320 21:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:26:14.455199 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:26:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 21:26:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:26:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 21:26:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:26:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:26:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:26:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:26:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:26:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:23.409770 543705 memory.go:184] no items to output this cycle
I0320 21:26:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 21:26:26.221675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:26:26.224213 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:26:26.224219 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5680 0xc0000c56c0]
E0320 21:26:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:33.409777 543705 memory.go:184] no items to output this cycle
I0320 21:26:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 21:26:43.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:43.409913 543705 cpu.go:282] Add success.
I0320 21:26:43.409945 543705 memory.go:191] Add success.
I0320 21:26:43.419739 543705 net.go:648] Add success.
I0320 21:26:43.422476 543705 net.go:770] primary dev: ETH0
I0320 21:26:43.422489 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:26:43.422501 543705 net.go:698] Add success.
I0320 21:26:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:26:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:26:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:26:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:26:53.409794 543705 memory.go:184] no items to output this cycle
I0320 21:26:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 21:27:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:03.409781 543705 memory.go:184] no items to output this cycle
I0320 21:27:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 21:27:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:13.409812 543705 memory.go:191] Add success.
I0320 21:27:13.409815 543705 cpu.go:282] Add success.
W0320 21:27:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:27:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:27:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:27:13.420137 543705 net.go:648] Add success.
I0320 21:27:13.422865 543705 net.go:770] primary dev: ETH0
I0320 21:27:13.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:27:13.422895 543705 net.go:698] Add success.
I0320 21:27:13.428871 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 21:27:13.453046 543705 event_worker.go:152] Polling the log file for events...
I0320 21:27:13.463283 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d2d348c0-4c64-4492-a2f0-7218a20c57f6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:27:13.463315 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 21:27:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:27:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 21:27:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:27:14.456868 543705 disk_worker.go:494] system disk:vda1
E0320 21:27:14.456888 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:27:14.456896 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:27:14.456900 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:27:14.456914 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:27:15.456823 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:27:15.456831 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:27:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:27:16.457980 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:27:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:27:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:27:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:27:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:23.409793 543705 memory.go:184] no items to output this cycle
I0320 21:27:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 21:27:26.225677 543705 disk_info.go:125] begin check local disk info of client
I0320 21:27:26.228265 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:27:26.228271 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037dc00 0xc00037dc40]
E0320 21:27:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:33.409770 543705 memory.go:184] no items to output this cycle
I0320 21:27:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 21:27:38.625731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:27:38.625737 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:27:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:43.410787 543705 memory.go:191] Add success.
I0320 21:27:43.409971 543705 cpu.go:282] Add success.
I0320 21:27:43.419701 543705 net.go:648] Add success.
I0320 21:27:43.422194 543705 net.go:770] primary dev: ETH0
I0320 21:27:43.422206 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:27:43.422218 543705 net.go:698] Add success.
I0320 21:27:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:27:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:27:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:27:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:27:53.409771 543705 memory.go:184] no items to output this cycle
I0320 21:27:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:28:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:03.409788 543705 memory.go:184] no items to output this cycle
I0320 21:28:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:28:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:13.409839 543705 memory.go:191] Add success.
I0320 21:28:13.409844 543705 cpu.go:282] Add success.
W0320 21:28:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:28:13.409891 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:28:13.409895 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:28:13.420177 543705 net.go:648] Add success.
I0320 21:28:13.423314 543705 net.go:770] primary dev: ETH0
I0320 21:28:13.423327 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:28:13.423339 543705 net.go:698] Add success.
I0320 21:28:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:28:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:28:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 21:28:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:28:14.456569 543705 disk_worker.go:494] system disk:vda1
I0320 21:28:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:28:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:28:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:28:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:28:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:28:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:28:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:23.409783 543705 memory.go:184] no items to output this cycle
I0320 21:28:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 21:28:26.229692 543705 disk_info.go:125] begin check local disk info of client
I0320 21:28:26.232182 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:28:26.232188 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f200 0xc00029f240]
E0320 21:28:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:33.409776 543705 memory.go:184] no items to output this cycle
I0320 21:28:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 21:28:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:43.409912 543705 cpu.go:282] Add success.
I0320 21:28:43.409921 543705 memory.go:191] Add success.
I0320 21:28:43.419728 543705 net.go:648] Add success.
I0320 21:28:43.422413 543705 net.go:770] primary dev: ETH0
I0320 21:28:43.422428 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:28:43.422442 543705 net.go:698] Add success.
I0320 21:28:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:28:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:28:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:28:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:28:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 21:28:53.409793 543705 memory.go:184] no items to output this cycle
E0320 21:29:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:03.409804 543705 memory.go:184] no items to output this cycle
I0320 21:29:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 21:29:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:13.409799 543705 cpu.go:282] Add success.
I0320 21:29:13.409805 543705 memory.go:191] Add success.
W0320 21:29:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:29:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:29:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:29:13.420296 543705 net.go:648] Add success.
I0320 21:29:13.422895 543705 net.go:770] primary dev: ETH0
I0320 21:29:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:29:13.422924 543705 net.go:698] Add success.
I0320 21:29:14.454949 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:29:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:29:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0320 21:29:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:29:14.456553 543705 disk_worker.go:494] system disk:vda1
I0320 21:29:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:29:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:29:16.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:29:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:29:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:29:16.472412 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:29:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:23.409775 543705 memory.go:184] no items to output this cycle
I0320 21:29:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 21:29:26.233678 543705 disk_info.go:125] begin check local disk info of client
I0320 21:29:26.236155 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:29:26.236161 543705 disk_info.go:196] parse disk info done, disk is : [0xc000460140 0xc000460180]
E0320 21:29:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:33.409794 543705 memory.go:184] no items to output this cycle
I0320 21:29:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 21:29:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:43.409796 543705 memory.go:191] Add success.
I0320 21:29:43.409799 543705 cpu.go:282] Add success.
I0320 21:29:43.419830 543705 net.go:648] Add success.
I0320 21:29:43.422572 543705 net.go:770] primary dev: ETH0
I0320 21:29:43.422588 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:29:43.422601 543705 net.go:698] Add success.
I0320 21:29:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:29:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:29:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:29:53.410332 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:29:53.410349 543705 memory.go:184] no items to output this cycle
I0320 21:29:53.410377 543705 cpu.go:275] no items to output this cycle
E0320 21:30:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:03.409785 543705 memory.go:184] no items to output this cycle
I0320 21:30:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 21:30:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:13.409823 543705 memory.go:191] Add success.
I0320 21:30:13.409830 543705 cpu.go:282] Add success.
W0320 21:30:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:30:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:30:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:30:13.420186 543705 net.go:648] Add success.
I0320 21:30:13.422817 543705 net.go:770] primary dev: ETH0
I0320 21:30:13.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:30:13.422842 543705 net.go:698] Add success.
I0320 21:30:13.469313 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b441df17-4a61-4fbe-8886-c8d6b17310cf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:30:13.469347 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:30:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:30:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:30:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 21:30:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:30:14.456569 543705 disk_worker.go:494] system disk:vda1
I0320 21:30:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:30:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:30:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:30:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:30:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:30:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:30:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:23.409810 543705 memory.go:184] no items to output this cycle
I0320 21:30:23.409822 543705 cpu.go:275] no items to output this cycle
I0320 21:30:26.237672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:30:26.240070 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:30:26.240077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486a40 0xc000486a80]
E0320 21:30:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:33.409788 543705 cpu.go:275] no items to output this cycle
I0320 21:30:33.409792 543705 memory.go:184] no items to output this cycle
I0320 21:30:38.626830 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:30:38.626837 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:30:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:43.410765 543705 memory.go:191] Add success.
I0320 21:30:43.409812 543705 cpu.go:282] Add success.
I0320 21:30:43.420444 543705 net.go:648] Add success.
I0320 21:30:43.423477 543705 net.go:770] primary dev: ETH0
I0320 21:30:43.423491 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:30:43.423503 543705 net.go:698] Add success.
I0320 21:30:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:30:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:30:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:30:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:30:53.409767 543705 memory.go:184] no items to output this cycle
I0320 21:30:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:31:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:03.409791 543705 memory.go:184] no items to output this cycle
I0320 21:31:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 21:31:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:13.409784 543705 memory.go:191] Add success.
I0320 21:31:13.409806 543705 cpu.go:282] Add success.
W0320 21:31:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:31:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:31:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:31:13.420136 543705 net.go:648] Add success.
I0320 21:31:13.422887 543705 net.go:770] primary dev: ETH0
I0320 21:31:13.422899 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:31:13.422911 543705 net.go:698] Add success.
I0320 21:31:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:31:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:31:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 21:31:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:31:14.456516 543705 disk_worker.go:494] system disk:vda1
I0320 21:31:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:31:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:31:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:31:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:31:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:31:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:31:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:23.409810 543705 memory.go:184] no items to output this cycle
I0320 21:31:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 21:31:26.241674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:31:26.244143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:31:26.244149 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029f700 0xc00029f740]
E0320 21:31:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:33.409800 543705 memory.go:184] no items to output this cycle
I0320 21:31:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 21:31:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:43.409796 543705 memory.go:191] Add success.
I0320 21:31:43.409800 543705 cpu.go:282] Add success.
I0320 21:31:43.419980 543705 net.go:648] Add success.
I0320 21:31:43.422760 543705 net.go:770] primary dev: ETH0
I0320 21:31:43.422774 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:31:43.422786 543705 net.go:698] Add success.
I0320 21:31:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:31:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:31:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:31:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:31:53.409781 543705 memory.go:184] no items to output this cycle
I0320 21:31:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:32:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:03.409781 543705 memory.go:184] no items to output this cycle
I0320 21:32:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 21:32:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:13.409818 543705 memory.go:191] Add success.
I0320 21:32:13.409821 543705 cpu.go:282] Add success.
W0320 21:32:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:32:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:32:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:32:13.420249 543705 net.go:648] Add success.
I0320 21:32:13.423104 543705 net.go:770] primary dev: ETH0
I0320 21:32:13.423119 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:32:13.423133 543705 net.go:698] Add success.
W0320 21:32:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:32:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 21:32:14.455169 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:32:14.456806 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:32:14.456815 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:32:14.456822 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:32:14.456867 543705 disk_worker.go:494] system disk:vda1
I0320 21:32:14.456909 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:32:15.456846 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:32:15.456854 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:32:16.457924 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:32:16.457922 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:32:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:32:16.457998 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:32:16.472320 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:32:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:23.409803 543705 memory.go:184] no items to output this cycle
I0320 21:32:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 21:32:26.245674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:32:26.248196 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:32:26.248203 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab040 0xc0001ab080]
E0320 21:32:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:33.409803 543705 memory.go:184] no items to output this cycle
I0320 21:32:33.409809 543705 cpu.go:275] no items to output this cycle
E0320 21:32:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:43.409774 543705 memory.go:191] Add success.
I0320 21:32:43.409801 543705 cpu.go:282] Add success.
I0320 21:32:43.419876 543705 net.go:648] Add success.
I0320 21:32:43.422523 543705 net.go:770] primary dev: ETH0
I0320 21:32:43.422537 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:32:43.422548 543705 net.go:698] Add success.
I0320 21:32:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:32:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:32:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:32:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:32:53.409768 543705 memory.go:184] no items to output this cycle
I0320 21:32:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 21:33:03.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:03.409938 543705 cpu.go:275] no items to output this cycle
I0320 21:33:03.409967 543705 memory.go:184] no items to output this cycle
E0320 21:33:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:13.409818 543705 memory.go:191] Add success.
I0320 21:33:13.409829 543705 cpu.go:282] Add success.
W0320 21:33:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:33:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:33:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:33:13.420169 543705 net.go:648] Add success.
I0320 21:33:13.422734 543705 net.go:770] primary dev: ETH0
I0320 21:33:13.422752 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:33:13.422768 543705 net.go:698] Add success.
I0320 21:33:13.468695 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8e9df1ee-5e2c-4c3f-9582-ec32ed4015fd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:33:13.468727 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:33:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:33:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:33:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 21:33:14.455180 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:33:14.456508 543705 disk_worker.go:494] system disk:vda1
I0320 21:33:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:33:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:33:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:33:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:33:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:33:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:33:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 21:33:23.409791 543705 memory.go:184] no items to output this cycle
I0320 21:33:26.249675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:33:26.252175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:33:26.252181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5cc0 0xc0000c5d00]
E0320 21:33:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:33.409805 543705 memory.go:184] no items to output this cycle
I0320 21:33:33.409816 543705 cpu.go:275] no items to output this cycle
I0320 21:33:38.627844 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:33:38.627852 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:33:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:43.410838 543705 memory.go:191] Add success.
I0320 21:33:43.409820 543705 cpu.go:282] Add success.
I0320 21:33:43.420561 543705 net.go:648] Add success.
I0320 21:33:43.423199 543705 net.go:770] primary dev: ETH0
I0320 21:33:43.423211 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:33:43.423224 543705 net.go:698] Add success.
I0320 21:33:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:33:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:33:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:33:53.410386 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:33:53.410403 543705 cpu.go:275] no items to output this cycle
I0320 21:33:53.410415 543705 memory.go:184] no items to output this cycle
E0320 21:34:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:03.409785 543705 memory.go:184] no items to output this cycle
I0320 21:34:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 21:34:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:13.409824 543705 memory.go:191] Add success.
I0320 21:34:13.409828 543705 cpu.go:282] Add success.
W0320 21:34:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:34:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:34:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:34:13.420227 543705 net.go:648] Add success.
I0320 21:34:13.422884 543705 net.go:770] primary dev: ETH0
I0320 21:34:13.422899 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:34:13.422914 543705 net.go:698] Add success.
I0320 21:34:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:34:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:34:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 21:34:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:34:14.456579 543705 disk_worker.go:494] system disk:vda1
I0320 21:34:14.456608 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:34:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:34:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:34:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:34:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:34:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:34:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:23.409800 543705 memory.go:184] no items to output this cycle
I0320 21:34:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 21:34:26.253673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:34:26.256235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:34:26.256241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2f00 0xc0002b2f40]
E0320 21:34:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:33.409782 543705 memory.go:184] no items to output this cycle
I0320 21:34:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 21:34:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:43.409809 543705 memory.go:191] Add success.
I0320 21:34:43.409819 543705 cpu.go:282] Add success.
I0320 21:34:43.419857 543705 net.go:648] Add success.
I0320 21:34:43.422677 543705 net.go:770] primary dev: ETH0
I0320 21:34:43.422695 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:34:43.422709 543705 net.go:698] Add success.
I0320 21:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:34:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:34:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:34:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:34:53.409795 543705 memory.go:184] no items to output this cycle
I0320 21:34:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 21:35:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:03.409791 543705 cpu.go:275] no items to output this cycle
I0320 21:35:03.409794 543705 memory.go:184] no items to output this cycle
E0320 21:35:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:13.409781 543705 memory.go:191] Add success.
W0320 21:35:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 21:35:13.409809 543705 cpu.go:282] Add success.
W0320 21:35:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:35:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:35:13.420057 543705 net.go:648] Add success.
I0320 21:35:13.422686 543705 net.go:770] primary dev: ETH0
I0320 21:35:13.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:35:13.422710 543705 net.go:698] Add success.
I0320 21:35:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:35:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:35:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 21:35:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:35:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 21:35:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:35:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:35:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:35:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:35:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:35:16.472473 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:35:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:23.409797 543705 memory.go:184] no items to output this cycle
I0320 21:35:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 21:35:26.257673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:35:26.260188 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:35:26.260194 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2ec0 0xc0002b2f00]
E0320 21:35:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:33.409764 543705 memory.go:184] no items to output this cycle
I0320 21:35:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 21:35:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:43.409823 543705 memory.go:191] Add success.
I0320 21:35:43.409827 543705 cpu.go:282] Add success.
I0320 21:35:43.419871 543705 net.go:648] Add success.
I0320 21:35:43.422687 543705 net.go:770] primary dev: ETH0
I0320 21:35:43.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:35:43.422710 543705 net.go:698] Add success.
I0320 21:35:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:35:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:35:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:35:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:35:53.409793 543705 memory.go:184] no items to output this cycle
I0320 21:35:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 21:36:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:03.409785 543705 memory.go:184] no items to output this cycle
I0320 21:36:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:36:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:13.409803 543705 cpu.go:282] Add success.
I0320 21:36:13.409805 543705 memory.go:191] Add success.
W0320 21:36:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:36:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:36:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:36:13.420139 543705 net.go:648] Add success.
I0320 21:36:13.422935 543705 net.go:770] primary dev: ETH0
I0320 21:36:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:36:13.422962 543705 net.go:698] Add success.
I0320 21:36:13.469471 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1a1faa47-f056-4699-93c2-ea89a653f508","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:36:13.469504 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:36:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:36:14.455193 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:36:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0320 21:36:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:36:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 21:36:14.456622 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:36:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:36:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:36:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:36:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:36:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:36:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:23.409778 543705 memory.go:184] no items to output this cycle
I0320 21:36:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 21:36:26.261674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:36:26.264207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:36:26.264213 543705 disk_info.go:196] parse disk info done, disk is : [0xc000352f00 0xc000352f40]
E0320 21:36:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:33.409809 543705 memory.go:184] no items to output this cycle
I0320 21:36:33.409823 543705 cpu.go:275] no items to output this cycle
I0320 21:36:38.628847 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:36:38.628854 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:36:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:43.410638 543705 memory.go:191] Add success.
I0320 21:36:43.409820 543705 cpu.go:282] Add success.
I0320 21:36:43.420339 543705 net.go:648] Add success.
I0320 21:36:43.422882 543705 net.go:770] primary dev: ETH0
I0320 21:36:43.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:36:43.422910 543705 net.go:698] Add success.
I0320 21:36:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:36:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:36:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:36:53.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:36:53.409880 543705 memory.go:184] no items to output this cycle
I0320 21:36:53.409948 543705 cpu.go:275] no items to output this cycle
E0320 21:37:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:03.409821 543705 memory.go:184] no items to output this cycle
I0320 21:37:03.409836 543705 cpu.go:275] no items to output this cycle
E0320 21:37:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:13.409796 543705 cpu.go:282] Add success.
I0320 21:37:13.409799 543705 memory.go:191] Add success.
W0320 21:37:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:37:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:37:13.409860 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:37:13.420110 543705 net.go:648] Add success.
I0320 21:37:13.423145 543705 net.go:770] primary dev: ETH0
I0320 21:37:13.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:37:13.423171 543705 net.go:698] Add success.
I0320 21:37:13.453669 543705 event_worker.go:152] Polling the log file for events...
W0320 21:37:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:37:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 21:37:14.455176 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:37:14.457030 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:37:14.457040 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:37:14.457047 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:37:14.457095 543705 disk_worker.go:494] system disk:vda1
I0320 21:37:14.457141 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:37:15.456796 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:37:15.456805 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:37:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:37:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:37:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:37:16.458019 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:37:16.472453 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:37:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:23.409797 543705 memory.go:184] no items to output this cycle
I0320 21:37:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 21:37:26.265672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:37:26.268160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:37:26.268165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b35c0 0xc0002b3600]
E0320 21:37:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:33.409795 543705 memory.go:184] no items to output this cycle
I0320 21:37:33.409807 543705 cpu.go:275] no items to output this cycle
E0320 21:37:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:43.409773 543705 memory.go:191] Add success.
I0320 21:37:43.409806 543705 cpu.go:282] Add success.
I0320 21:37:43.419854 543705 net.go:648] Add success.
I0320 21:37:43.422497 543705 net.go:770] primary dev: ETH0
I0320 21:37:43.422509 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:37:43.422523 543705 net.go:698] Add success.
I0320 21:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:37:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:37:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:37:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:37:53.409778 543705 memory.go:184] no items to output this cycle
I0320 21:37:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 21:38:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:03.409815 543705 memory.go:184] no items to output this cycle
I0320 21:38:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 21:38:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:13.409788 543705 memory.go:191] Add success.
I0320 21:38:13.409807 543705 cpu.go:282] Add success.
W0320 21:38:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:38:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:38:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:38:13.420243 543705 net.go:648] Add success.
I0320 21:38:13.423017 543705 net.go:770] primary dev: ETH0
I0320 21:38:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:38:13.423046 543705 net.go:698] Add success.
I0320 21:38:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:38:14.455133 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:38:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 21:38:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:38:14.456591 543705 disk_worker.go:494] system disk:vda1
I0320 21:38:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:38:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:38:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:38:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:38:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:38:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:38:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:23.409785 543705 memory.go:184] no items to output this cycle
I0320 21:38:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 21:38:26.269677 543705 disk_info.go:125] begin check local disk info of client
I0320 21:38:26.272150 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:38:26.272156 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278ac0 0xc000278b00]
E0320 21:38:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:33.409770 543705 memory.go:184] no items to output this cycle
I0320 21:38:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:38:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:43.409820 543705 memory.go:191] Add success.
I0320 21:38:43.409826 543705 cpu.go:282] Add success.
I0320 21:38:43.419954 543705 net.go:648] Add success.
I0320 21:38:43.422606 543705 net.go:770] primary dev: ETH0
I0320 21:38:43.422619 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:38:43.422631 543705 net.go:698] Add success.
I0320 21:38:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:38:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:38:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:38:53.410244 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:38:53.410265 543705 memory.go:184] no items to output this cycle
I0320 21:38:53.410289 543705 cpu.go:275] no items to output this cycle
E0320 21:39:03.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:03.409813 543705 memory.go:184] no items to output this cycle
I0320 21:39:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 21:39:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:13.409822 543705 memory.go:191] Add success.
I0320 21:39:13.409822 543705 cpu.go:282] Add success.
W0320 21:39:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:39:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:39:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:39:13.420219 543705 net.go:648] Add success.
I0320 21:39:13.422822 543705 net.go:770] primary dev: ETH0
I0320 21:39:13.422838 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:39:13.422851 543705 net.go:698] Add success.
I0320 21:39:13.469755 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5fc8f91d-21cb-44d1-935b-f514917a1f96","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:39:13.469788 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:39:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:39:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:39:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 21:39:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:39:14.456627 543705 disk_worker.go:494] system disk:vda1
I0320 21:39:14.456657 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:39:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:39:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:39:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:39:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:39:16.472390 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:39:23.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:23.409766 543705 memory.go:184] no items to output this cycle
I0320 21:39:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 21:39:26.273670 543705 disk_info.go:125] begin check local disk info of client
I0320 21:39:26.276162 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:39:26.276167 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046d900 0xc00046d940]
E0320 21:39:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:33.409797 543705 memory.go:184] no items to output this cycle
I0320 21:39:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 21:39:38.629744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:39:38.629752 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:39:43.409924 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:43.410680 543705 memory.go:191] Add success.
I0320 21:39:43.410064 543705 cpu.go:282] Add success.
I0320 21:39:43.419719 543705 net.go:648] Add success.
I0320 21:39:43.422578 543705 net.go:770] primary dev: ETH0
I0320 21:39:43.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:39:43.422602 543705 net.go:698] Add success.
I0320 21:39:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:39:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:39:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:39:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:39:53.409770 543705 memory.go:184] no items to output this cycle
I0320 21:39:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 21:40:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:03.409775 543705 memory.go:184] no items to output this cycle
I0320 21:40:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 21:40:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:13.409818 543705 memory.go:191] Add success.
I0320 21:40:13.409827 543705 cpu.go:282] Add success.
W0320 21:40:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:40:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:40:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:40:13.420347 543705 net.go:648] Add success.
I0320 21:40:13.423018 543705 net.go:770] primary dev: ETH0
I0320 21:40:13.423032 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:40:13.423043 543705 net.go:698] Add success.
I0320 21:40:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:40:14.455215 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:40:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0320 21:40:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:40:14.456618 543705 disk_worker.go:494] system disk:vda1
I0320 21:40:14.456651 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:40:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:40:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:40:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:40:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:40:16.472478 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:40:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:23.409786 543705 memory.go:184] no items to output this cycle
I0320 21:40:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 21:40:26.277674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:40:26.280194 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:40:26.280201 543705 disk_info.go:196] parse disk info done, disk is : [0xc000346380 0xc0003463c0]
E0320 21:40:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:33.409772 543705 memory.go:184] no items to output this cycle
I0320 21:40:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:40:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:43.409793 543705 cpu.go:282] Add success.
I0320 21:40:43.409794 543705 memory.go:191] Add success.
I0320 21:40:43.419979 543705 net.go:648] Add success.
I0320 21:40:43.422889 543705 net.go:770] primary dev: ETH0
I0320 21:40:43.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:40:43.422918 543705 net.go:698] Add success.
I0320 21:40:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:40:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:40:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:40:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:40:53.409763 543705 memory.go:184] no items to output this cycle
I0320 21:40:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 21:41:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:03.409813 543705 memory.go:184] no items to output this cycle
I0320 21:41:03.409824 543705 cpu.go:275] no items to output this cycle
E0320 21:41:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:13.409798 543705 memory.go:191] Add success.
I0320 21:41:13.409799 543705 cpu.go:282] Add success.
W0320 21:41:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:41:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:41:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:41:13.420143 543705 net.go:648] Add success.
I0320 21:41:13.422712 543705 net.go:770] primary dev: ETH0
I0320 21:41:13.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:41:13.422741 543705 net.go:698] Add success.
I0320 21:41:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:41:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:41:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 21:41:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:41:14.456596 543705 disk_worker.go:494] system disk:vda1
I0320 21:41:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:41:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:41:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:41:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:41:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:41:16.472429 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:41:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:23.409768 543705 memory.go:184] no items to output this cycle
I0320 21:41:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 21:41:26.281673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:41:26.284125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:41:26.284132 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa0c0 0xc0001aa100]
E0320 21:41:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:33.409767 543705 memory.go:184] no items to output this cycle
I0320 21:41:33.409806 543705 cpu.go:275] no items to output this cycle
E0320 21:41:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:43.409811 543705 memory.go:191] Add success.
I0320 21:41:43.409824 543705 cpu.go:282] Add success.
I0320 21:41:43.419934 543705 net.go:648] Add success.
I0320 21:41:43.422761 543705 net.go:770] primary dev: ETH0
I0320 21:41:43.422774 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:41:43.422786 543705 net.go:698] Add success.
I0320 21:41:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:41:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:41:53.410365 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:41:53.410380 543705 memory.go:184] no items to output this cycle
I0320 21:41:53.410383 543705 cpu.go:275] no items to output this cycle
E0320 21:42:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:03.409777 543705 memory.go:184] no items to output this cycle
I0320 21:42:03.409800 543705 cpu.go:275] no items to output this cycle
E0320 21:42:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:13.409815 543705 memory.go:191] Add success.
I0320 21:42:13.409822 543705 cpu.go:282] Add success.
W0320 21:42:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:42:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:42:13.409866 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:42:13.420108 543705 net.go:648] Add success.
I0320 21:42:13.423018 543705 net.go:770] primary dev: ETH0
I0320 21:42:13.423031 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:42:13.423056 543705 net.go:698] Add success.
I0320 21:42:13.469917 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e1cfd884-7c8f-4a38-b00f-0cf48e0f86e0","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:42:13.469955 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 21:42:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:42:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0320 21:42:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:42:14.456910 543705 disk_worker.go:494] system disk:vda1
E0320 21:42:14.456919 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:42:14.456927 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:42:14.456931 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:42:14.456954 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:42:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:42:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:42:16.457923 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:42:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:42:16.457978 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:42:16.457997 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:42:16.472308 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:42:23.410380 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:23.410405 543705 memory.go:184] no items to output this cycle
I0320 21:42:23.410408 543705 cpu.go:275] no items to output this cycle
I0320 21:42:26.285674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:42:26.288145 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:42:26.288151 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa8c0 0xc0001aa900]
E0320 21:42:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:33.409774 543705 memory.go:184] no items to output this cycle
I0320 21:42:33.409776 543705 cpu.go:275] no items to output this cycle
I0320 21:42:38.630845 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:42:38.630851 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:42:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:43.410777 543705 memory.go:191] Add success.
I0320 21:42:43.409789 543705 cpu.go:282] Add success.
I0320 21:42:43.420460 543705 net.go:648] Add success.
I0320 21:42:43.423078 543705 net.go:770] primary dev: ETH0
I0320 21:42:43.423091 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:42:43.423104 543705 net.go:698] Add success.
I0320 21:42:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:42:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:42:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:42:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:42:53.409778 543705 memory.go:184] no items to output this cycle
I0320 21:42:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 21:43:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:03.409804 543705 memory.go:184] no items to output this cycle
I0320 21:43:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 21:43:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:13.409785 543705 memory.go:191] Add success.
I0320 21:43:13.409802 543705 cpu.go:282] Add success.
W0320 21:43:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:43:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:43:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:43:13.420132 543705 net.go:648] Add success.
I0320 21:43:13.423133 543705 net.go:770] primary dev: ETH0
I0320 21:43:13.423146 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:43:13.423157 543705 net.go:698] Add success.
I0320 21:43:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:43:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:43:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 21:43:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:43:14.456860 543705 disk_worker.go:494] system disk:vda1
I0320 21:43:14.456894 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:43:15.455947 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:43:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:43:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:43:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:43:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:43:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:23.409762 543705 memory.go:184] no items to output this cycle
I0320 21:43:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 21:43:26.289677 543705 disk_info.go:125] begin check local disk info of client
I0320 21:43:26.292090 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:43:26.292096 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a9080 0xc0002a90c0]
E0320 21:43:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:33.409799 543705 memory.go:184] no items to output this cycle
I0320 21:43:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 21:43:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:43.409811 543705 memory.go:191] Add success.
I0320 21:43:43.409817 543705 cpu.go:282] Add success.
I0320 21:43:43.419905 543705 net.go:648] Add success.
I0320 21:43:43.423076 543705 net.go:770] primary dev: ETH0
I0320 21:43:43.423091 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:43:43.423107 543705 net.go:698] Add success.
I0320 21:43:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:43:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:43:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:43:53.410202 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:43:53.410229 543705 memory.go:184] no items to output this cycle
I0320 21:43:53.410235 543705 cpu.go:275] no items to output this cycle
E0320 21:44:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:03.409805 543705 memory.go:184] no items to output this cycle
I0320 21:44:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 21:44:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:13.409788 543705 memory.go:191] Add success.
I0320 21:44:13.409807 543705 cpu.go:282] Add success.
W0320 21:44:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:44:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:44:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:44:13.420219 543705 net.go:648] Add success.
I0320 21:44:13.422832 543705 net.go:770] primary dev: ETH0
I0320 21:44:13.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:44:13.422855 543705 net.go:698] Add success.
I0320 21:44:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:44:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:44:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 21:44:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:44:14.456614 543705 disk_worker.go:494] system disk:vda1
I0320 21:44:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:44:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:44:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:44:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:44:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:44:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:44:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 21:44:23.409782 543705 memory.go:184] no items to output this cycle
I0320 21:44:26.293676 543705 disk_info.go:125] begin check local disk info of client
I0320 21:44:26.296153 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:44:26.296159 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b700 0xc00007b740]
E0320 21:44:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:33.409770 543705 memory.go:184] no items to output this cycle
I0320 21:44:33.409777 543705 cpu.go:275] no items to output this cycle
E0320 21:44:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:43.409789 543705 memory.go:191] Add success.
I0320 21:44:43.409791 543705 cpu.go:282] Add success.
I0320 21:44:43.419851 543705 net.go:648] Add success.
I0320 21:44:43.422734 543705 net.go:770] primary dev: ETH0
I0320 21:44:43.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:44:43.422759 543705 net.go:698] Add success.
I0320 21:44:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:44:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:44:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:44:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:44:53.409775 543705 memory.go:184] no items to output this cycle
I0320 21:44:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 21:45:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:03.409793 543705 memory.go:184] no items to output this cycle
I0320 21:45:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 21:45:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:13.409903 543705 cpu.go:282] Add success.
I0320 21:45:13.409914 543705 memory.go:191] Add success.
W0320 21:45:13.409956 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:45:13.409987 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:45:13.409992 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:45:13.419746 543705 net.go:648] Add success.
I0320 21:45:13.422364 543705 net.go:770] primary dev: ETH0
I0320 21:45:13.422379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:45:13.422392 543705 net.go:698] Add success.
I0320 21:45:13.464030 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7d17b432-c0cd-4dcb-a9d8-04539dbf695c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:45:13.464062 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:45:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:45:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:45:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 21:45:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:45:14.456485 543705 disk_worker.go:494] system disk:vda1
I0320 21:45:14.456529 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:45:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:45:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:45:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:45:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:45:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:45:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:23.409786 543705 memory.go:184] no items to output this cycle
I0320 21:45:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 21:45:26.297675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:45:26.300161 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:45:26.300167 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004b1840 0xc0004b1880]
E0320 21:45:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:33.409784 543705 memory.go:184] no items to output this cycle
I0320 21:45:33.409804 543705 cpu.go:275] no items to output this cycle
I0320 21:45:38.631860 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:45:38.631867 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:45:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:43.410718 543705 memory.go:191] Add success.
I0320 21:45:43.409830 543705 cpu.go:282] Add success.
I0320 21:45:43.420442 543705 net.go:648] Add success.
I0320 21:45:43.423153 543705 net.go:770] primary dev: ETH0
I0320 21:45:43.423167 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:45:43.423179 543705 net.go:698] Add success.
I0320 21:45:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:45:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:45:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:45:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:45:53.409800 543705 memory.go:184] no items to output this cycle
I0320 21:45:53.409801 543705 cpu.go:275] no items to output this cycle
I0320 21:46:03.409894 543705 cpu.go:275] no items to output this cycle
E0320 21:46:03.409890 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:03.409933 543705 memory.go:184] no items to output this cycle
E0320 21:46:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:13.409789 543705 memory.go:191] Add success.
I0320 21:46:13.409816 543705 cpu.go:282] Add success.
W0320 21:46:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:46:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:46:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:46:13.420174 543705 net.go:648] Add success.
I0320 21:46:13.423417 543705 net.go:770] primary dev: ETH0
I0320 21:46:13.423430 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:46:13.423442 543705 net.go:698] Add success.
I0320 21:46:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:46:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:46:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0320 21:46:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:46:14.456589 543705 disk_worker.go:494] system disk:vda1
I0320 21:46:14.456617 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:46:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:46:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:46:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:46:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:46:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:46:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:23.409793 543705 memory.go:184] no items to output this cycle
I0320 21:46:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 21:46:26.301673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:46:26.304249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:46:26.304256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbd40 0xc0001fbd80]
E0320 21:46:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:33.409771 543705 memory.go:184] no items to output this cycle
I0320 21:46:33.409791 543705 cpu.go:275] no items to output this cycle
E0320 21:46:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:43.409821 543705 memory.go:191] Add success.
I0320 21:46:43.409822 543705 cpu.go:282] Add success.
I0320 21:46:43.420030 543705 net.go:648] Add success.
I0320 21:46:43.422602 543705 net.go:770] primary dev: ETH0
I0320 21:46:43.422617 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:46:43.422631 543705 net.go:698] Add success.
I0320 21:46:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:46:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:46:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:46:53.409858 543705 cpu.go:275] no items to output this cycle
E0320 21:46:53.409915 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:46:53.409927 543705 memory.go:184] no items to output this cycle
E0320 21:47:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:03.409788 543705 memory.go:184] no items to output this cycle
I0320 21:47:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 21:47:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:13.409818 543705 memory.go:191] Add success.
I0320 21:47:13.409822 543705 cpu.go:282] Add success.
W0320 21:47:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:47:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:47:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:47:13.420148 543705 net.go:648] Add success.
I0320 21:47:13.422970 543705 net.go:770] primary dev: ETH0
I0320 21:47:13.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:47:13.422997 543705 net.go:698] Add success.
I0320 21:47:13.453531 543705 event_worker.go:152] Polling the log file for events...
W0320 21:47:14.455176 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:47:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 21:47:14.455192 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:47:14.456807 543705 disk_worker.go:494] system disk:vda1
I0320 21:47:14.456847 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:47:14.457136 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:47:14.457144 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:47:14.457149 543705 custom_config.go:64] query custom config with name: gpu
E0320 21:47:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:47:15.456823 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:47:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:47:16.457932 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:47:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:47:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:47:16.472342 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:47:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:23.409789 543705 memory.go:184] no items to output this cycle
I0320 21:47:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 21:47:26.305669 543705 disk_info.go:125] begin check local disk info of client
I0320 21:47:26.308166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:47:26.308172 543705 disk_info.go:196] parse disk info done, disk is : [0xc000370140 0xc000370180]
E0320 21:47:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:33.409798 543705 memory.go:184] no items to output this cycle
I0320 21:47:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 21:47:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:43.409809 543705 memory.go:191] Add success.
I0320 21:47:43.409818 543705 cpu.go:282] Add success.
I0320 21:47:43.419989 543705 net.go:648] Add success.
I0320 21:47:43.422814 543705 net.go:770] primary dev: ETH0
I0320 21:47:43.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:47:43.422839 543705 net.go:698] Add success.
I0320 21:47:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:47:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:47:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:47:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:47:53.409768 543705 memory.go:184] no items to output this cycle
I0320 21:47:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 21:48:03.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:03.409800 543705 cpu.go:275] no items to output this cycle
I0320 21:48:03.409811 543705 memory.go:184] no items to output this cycle
E0320 21:48:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:13.409831 543705 memory.go:191] Add success.
I0320 21:48:13.409840 543705 cpu.go:282] Add success.
W0320 21:48:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:48:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:48:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:48:13.420224 543705 net.go:648] Add success.
I0320 21:48:13.423181 543705 net.go:770] primary dev: ETH0
I0320 21:48:13.423211 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:48:13.423224 543705 net.go:698] Add success.
I0320 21:48:13.471085 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"57f910c1-e8f8-4a8d-8710-56dd1c582052","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:48:13.471119 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:48:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:48:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:48:14.455244 543705 disk_worker.go:708] disk space is not compliant
W0320 21:48:14.455247 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:48:14.456763 543705 disk_worker.go:494] system disk:vda1
I0320 21:48:14.456793 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:48:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:48:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:48:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:48:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:48:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:48:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:23.409765 543705 memory.go:184] no items to output this cycle
I0320 21:48:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 21:48:26.309673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:48:26.312146 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:48:26.312153 543705 disk_info.go:196] parse disk info done, disk is : [0xc000461640 0xc000461680]
E0320 21:48:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:33.409797 543705 memory.go:184] no items to output this cycle
I0320 21:48:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 21:48:38.632008 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:48:38.632014 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:48:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:43.410566 543705 memory.go:191] Add success.
I0320 21:48:43.409804 543705 cpu.go:282] Add success.
I0320 21:48:43.420297 543705 net.go:648] Add success.
I0320 21:48:43.422793 543705 net.go:770] primary dev: ETH0
I0320 21:48:43.422806 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:48:43.422818 543705 net.go:698] Add success.
I0320 21:48:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:48:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:48:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:48:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:48:53.409786 543705 memory.go:184] no items to output this cycle
I0320 21:48:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 21:49:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:03.409796 543705 memory.go:184] no items to output this cycle
I0320 21:49:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 21:49:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:13.409802 543705 memory.go:191] Add success.
I0320 21:49:13.409805 543705 cpu.go:282] Add success.
W0320 21:49:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:49:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:49:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:49:13.420204 543705 net.go:648] Add success.
I0320 21:49:13.423080 543705 net.go:770] primary dev: ETH0
I0320 21:49:13.423095 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:49:13.423109 543705 net.go:698] Add success.
I0320 21:49:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:49:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:49:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 21:49:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:49:14.456566 543705 disk_worker.go:494] system disk:vda1
I0320 21:49:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:49:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:49:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:49:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:49:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:49:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:49:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:23.409782 543705 memory.go:184] no items to output this cycle
I0320 21:49:23.409786 543705 cpu.go:275] no items to output this cycle
I0320 21:49:26.313672 543705 disk_info.go:125] begin check local disk info of client
I0320 21:49:26.316149 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:49:26.316155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004af3c0 0xc0004af400]
E0320 21:49:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:33.409779 543705 memory.go:184] no items to output this cycle
I0320 21:49:33.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:49:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:43.409783 543705 memory.go:191] Add success.
I0320 21:49:43.409807 543705 cpu.go:282] Add success.
I0320 21:49:43.419874 543705 net.go:648] Add success.
I0320 21:49:43.422735 543705 net.go:770] primary dev: ETH0
I0320 21:49:43.422749 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:49:43.422761 543705 net.go:698] Add success.
I0320 21:49:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:49:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:49:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:49:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:49:53.409773 543705 memory.go:184] no items to output this cycle
I0320 21:49:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 21:50:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:03.409803 543705 memory.go:184] no items to output this cycle
I0320 21:50:03.409917 543705 cpu.go:275] no items to output this cycle
E0320 21:50:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:13.409801 543705 memory.go:191] Add success.
I0320 21:50:13.409803 543705 cpu.go:282] Add success.
W0320 21:50:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:50:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:50:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:50:13.420072 543705 net.go:648] Add success.
I0320 21:50:13.423192 543705 net.go:770] primary dev: ETH0
I0320 21:50:13.423208 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:50:13.423223 543705 net.go:698] Add success.
I0320 21:50:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:50:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:50:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 21:50:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:50:14.456578 543705 disk_worker.go:494] system disk:vda1
I0320 21:50:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:50:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:50:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:50:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:50:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:50:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:50:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:23.409799 543705 memory.go:184] no items to output this cycle
I0320 21:50:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 21:50:26.317678 543705 disk_info.go:125] begin check local disk info of client
I0320 21:50:26.320184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:50:26.320191 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b02c0 0xc0003b0300]
E0320 21:50:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:33.409781 543705 memory.go:184] no items to output this cycle
I0320 21:50:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:50:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:43.409802 543705 memory.go:191] Add success.
I0320 21:50:43.409803 543705 cpu.go:282] Add success.
I0320 21:50:43.419983 543705 net.go:648] Add success.
I0320 21:50:43.422628 543705 net.go:770] primary dev: ETH0
I0320 21:50:43.422641 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:50:43.422654 543705 net.go:698] Add success.
I0320 21:50:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:50:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:50:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:50:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:50:53.409782 543705 memory.go:184] no items to output this cycle
I0320 21:50:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 21:51:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:03.409802 543705 memory.go:184] no items to output this cycle
I0320 21:51:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:51:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:13.409807 543705 memory.go:191] Add success.
I0320 21:51:13.409808 543705 cpu.go:282] Add success.
W0320 21:51:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:51:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:51:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:51:13.420137 543705 net.go:648] Add success.
I0320 21:51:13.422894 543705 net.go:770] primary dev: ETH0
I0320 21:51:13.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:51:13.422920 543705 net.go:698] Add success.
I0320 21:51:13.488104 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6a19251c-6362-45ad-a55b-70f7a5162382","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:51:13.488138 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:51:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:51:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:51:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 21:51:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:51:14.456676 543705 disk_worker.go:494] system disk:vda1
I0320 21:51:14.456706 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:51:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:51:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:51:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:51:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:51:16.472494 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:51:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:23.409782 543705 memory.go:184] no items to output this cycle
I0320 21:51:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 21:51:26.321675 543705 disk_info.go:125] begin check local disk info of client
I0320 21:51:26.324163 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:51:26.324169 543705 disk_info.go:196] parse disk info done, disk is : [0xc000342440 0xc000342480]
E0320 21:51:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:33.409778 543705 memory.go:184] no items to output this cycle
I0320 21:51:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 21:51:38.632864 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:51:38.632870 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:51:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:43.410606 543705 memory.go:191] Add success.
I0320 21:51:43.409833 543705 cpu.go:282] Add success.
I0320 21:51:43.420367 543705 net.go:648] Add success.
I0320 21:51:43.423047 543705 net.go:770] primary dev: ETH0
I0320 21:51:43.423061 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:51:43.423075 543705 net.go:698] Add success.
I0320 21:51:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:51:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:51:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:51:53.409897 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:51:53.409917 543705 memory.go:184] no items to output this cycle
I0320 21:51:53.409991 543705 cpu.go:275] no items to output this cycle
E0320 21:52:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:03.409786 543705 memory.go:184] no items to output this cycle
I0320 21:52:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 21:52:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:13.409816 543705 memory.go:191] Add success.
I0320 21:52:13.409819 543705 cpu.go:282] Add success.
W0320 21:52:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:52:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:52:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:52:13.420222 543705 net.go:648] Add success.
I0320 21:52:13.423182 543705 net.go:770] primary dev: ETH0
I0320 21:52:13.423199 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:52:13.423214 543705 net.go:698] Add success.
W0320 21:52:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:52:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 21:52:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:52:14.456800 543705 disk_worker.go:494] system disk:vda1
I0320 21:52:14.456838 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:52:14.457123 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:52:14.457131 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:52:14.457136 543705 custom_config.go:64] query custom config with name: gpu
E0320 21:52:15.456816 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:52:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:52:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:52:16.457948 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:52:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:52:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:52:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:52:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:23.409799 543705 memory.go:184] no items to output this cycle
I0320 21:52:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 21:52:26.325676 543705 disk_info.go:125] begin check local disk info of client
I0320 21:52:26.328185 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:52:26.328192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003bf380 0xc0003bf3c0]
E0320 21:52:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 21:52:33.409786 543705 memory.go:184] no items to output this cycle
E0320 21:52:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:43.409807 543705 memory.go:191] Add success.
I0320 21:52:43.409818 543705 cpu.go:282] Add success.
I0320 21:52:43.420174 543705 net.go:648] Add success.
I0320 21:52:43.422796 543705 net.go:770] primary dev: ETH0
I0320 21:52:43.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:52:43.422821 543705 net.go:698] Add success.
I0320 21:52:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:52:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:52:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:52:53.410354 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:52:53.410371 543705 memory.go:184] no items to output this cycle
I0320 21:52:53.410388 543705 cpu.go:275] no items to output this cycle
E0320 21:53:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:03.409788 543705 memory.go:184] no items to output this cycle
I0320 21:53:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 21:53:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:13.409824 543705 memory.go:191] Add success.
I0320 21:53:13.409829 543705 cpu.go:282] Add success.
W0320 21:53:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:53:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:53:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:53:13.420155 543705 net.go:648] Add success.
I0320 21:53:13.422999 543705 net.go:770] primary dev: ETH0
I0320 21:53:13.423025 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:53:13.423039 543705 net.go:698] Add success.
I0320 21:53:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:53:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:53:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 21:53:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:53:14.456483 543705 disk_worker.go:494] system disk:vda1
I0320 21:53:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:53:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:53:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:53:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:53:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:53:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:53:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:23.409805 543705 memory.go:184] no items to output this cycle
I0320 21:53:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 21:53:26.329674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:53:26.332198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:53:26.332204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002988c0 0xc000298900]
E0320 21:53:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:33.409804 543705 memory.go:184] no items to output this cycle
I0320 21:53:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 21:53:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:43.409787 543705 memory.go:191] Add success.
I0320 21:53:43.409806 543705 cpu.go:282] Add success.
I0320 21:53:43.420135 543705 net.go:648] Add success.
I0320 21:53:43.422830 543705 net.go:770] primary dev: ETH0
I0320 21:53:43.422844 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:53:43.422855 543705 net.go:698] Add success.
I0320 21:53:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:53:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:53:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:53:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:53:53.409807 543705 memory.go:184] no items to output this cycle
I0320 21:53:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 21:54:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:03.409789 543705 memory.go:184] no items to output this cycle
I0320 21:54:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 21:54:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:13.409795 543705 memory.go:191] Add success.
W0320 21:54:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 21:54:13.409823 543705 cpu.go:282] Add success.
W0320 21:54:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:54:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:54:13.420126 543705 net.go:648] Add success.
I0320 21:54:13.422920 543705 net.go:770] primary dev: ETH0
I0320 21:54:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:54:13.422944 543705 net.go:698] Add success.
I0320 21:54:13.470943 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b03b9a2a-c85c-4002-86a1-7ed658fdf110","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:54:13.470977 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 21:54:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:54:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:54:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 21:54:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:54:14.456539 543705 disk_worker.go:494] system disk:vda1
I0320 21:54:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:54:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:54:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:54:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:54:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:54:16.472470 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:54:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 21:54:23.409787 543705 memory.go:184] no items to output this cycle
I0320 21:54:26.333678 543705 disk_info.go:125] begin check local disk info of client
I0320 21:54:26.336130 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:54:26.336136 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003acc80 0xc0003accc0]
E0320 21:54:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:33.409782 543705 cpu.go:275] no items to output this cycle
I0320 21:54:33.409788 543705 memory.go:184] no items to output this cycle
I0320 21:54:38.633740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:54:38.633747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:54:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:43.410816 543705 memory.go:191] Add success.
I0320 21:54:43.409790 543705 cpu.go:282] Add success.
I0320 21:54:43.420594 543705 net.go:648] Add success.
I0320 21:54:43.423444 543705 net.go:770] primary dev: ETH0
I0320 21:54:43.423457 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:54:43.423468 543705 net.go:698] Add success.
I0320 21:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:54:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:54:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:54:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:54:53.409764 543705 memory.go:184] no items to output this cycle
I0320 21:54:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 21:55:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:03.409779 543705 memory.go:184] no items to output this cycle
I0320 21:55:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 21:55:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:13.409816 543705 memory.go:191] Add success.
I0320 21:55:13.409820 543705 cpu.go:282] Add success.
W0320 21:55:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:55:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:55:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:55:13.420191 543705 net.go:648] Add success.
I0320 21:55:13.422894 543705 net.go:770] primary dev: ETH0
I0320 21:55:13.422906 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:55:13.422919 543705 net.go:698] Add success.
I0320 21:55:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:55:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:55:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 21:55:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:55:14.456590 543705 disk_worker.go:494] system disk:vda1
I0320 21:55:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:55:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:55:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:55:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:55:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:55:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:55:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:23.409794 543705 memory.go:184] no items to output this cycle
I0320 21:55:23.409805 543705 cpu.go:275] no items to output this cycle
I0320 21:55:26.337670 543705 disk_info.go:125] begin check local disk info of client
I0320 21:55:26.340195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:55:26.340203 543705 disk_info.go:196] parse disk info done, disk is : [0xc000369cc0 0xc000369d00]
E0320 21:55:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:33.409767 543705 memory.go:184] no items to output this cycle
I0320 21:55:33.409785 543705 cpu.go:275] no items to output this cycle
E0320 21:55:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:43.409796 543705 memory.go:191] Add success.
I0320 21:55:43.409798 543705 cpu.go:282] Add success.
I0320 21:55:43.419877 543705 net.go:648] Add success.
I0320 21:55:43.422451 543705 net.go:770] primary dev: ETH0
I0320 21:55:43.422464 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:55:43.422475 543705 net.go:698] Add success.
I0320 21:55:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:55:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:55:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:55:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:55:53.409767 543705 memory.go:184] no items to output this cycle
I0320 21:55:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 21:56:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:03.409788 543705 memory.go:184] no items to output this cycle
I0320 21:56:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 21:56:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:13.409795 543705 memory.go:191] Add success.
I0320 21:56:13.409799 543705 cpu.go:282] Add success.
W0320 21:56:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:56:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:56:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:56:13.420069 543705 net.go:648] Add success.
I0320 21:56:13.423084 543705 net.go:770] primary dev: ETH0
I0320 21:56:13.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:56:13.423110 543705 net.go:698] Add success.
I0320 21:56:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:56:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:56:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 21:56:14.455182 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:56:14.456560 543705 disk_worker.go:494] system disk:vda1
I0320 21:56:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:56:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:56:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:56:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:56:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:56:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:56:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:23.409778 543705 cpu.go:275] no items to output this cycle
I0320 21:56:23.409781 543705 memory.go:184] no items to output this cycle
I0320 21:56:26.341674 543705 disk_info.go:125] begin check local disk info of client
I0320 21:56:26.344123 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:56:26.344129 543705 disk_info.go:196] parse disk info done, disk is : [0xc000497b40 0xc000497b80]
E0320 21:56:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:33.409773 543705 cpu.go:275] no items to output this cycle
I0320 21:56:33.409775 543705 memory.go:184] no items to output this cycle
E0320 21:56:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:43.409793 543705 memory.go:191] Add success.
I0320 21:56:43.409797 543705 cpu.go:282] Add success.
I0320 21:56:43.419992 543705 net.go:648] Add success.
I0320 21:56:43.422711 543705 net.go:770] primary dev: ETH0
I0320 21:56:43.422724 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:56:43.422737 543705 net.go:698] Add success.
I0320 21:56:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:56:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:56:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:56:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:56:53.409771 543705 memory.go:184] no items to output this cycle
I0320 21:56:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 21:57:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:03.409786 543705 memory.go:184] no items to output this cycle
I0320 21:57:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 21:57:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:13.409785 543705 memory.go:191] Add success.
I0320 21:57:13.409789 543705 cpu.go:282] Add success.
W0320 21:57:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:57:13.412506 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:57:13.412512 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:57:13.420104 543705 net.go:648] Add success.
I0320 21:57:13.428176 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 21:57:13.428259 543705 net.go:770] primary dev: ETH0
I0320 21:57:13.428272 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:57:13.428284 543705 net.go:698] Add success.
I0320 21:57:13.452772 543705 event_worker.go:152] Polling the log file for events...
I0320 21:57:13.468883 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2874e436-cfb2-49ba-bff5-787d430486eb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 21:57:13.468917 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 21:57:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:57:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 21:57:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0320 21:57:14.456796 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 21:57:14.456805 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 21:57:14.456811 543705 custom_config.go:64] query custom config with name: gpu
I0320 21:57:14.456840 543705 disk_worker.go:494] system disk:vda1
I0320 21:57:14.456870 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 21:57:15.456891 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 21:57:15.456900 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:57:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 21:57:16.457988 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 21:57:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:57:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:57:16.472414 543705 disk_local_worker.go:436] Get disk info: []
I0320 21:57:23.409930 543705 cpu.go:275] no items to output this cycle
E0320 21:57:23.409931 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:23.409982 543705 memory.go:184] no items to output this cycle
I0320 21:57:26.345670 543705 disk_info.go:125] begin check local disk info of client
I0320 21:57:26.348112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:57:26.348118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004de4c0 0xc0004de500]
E0320 21:57:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:33.409778 543705 memory.go:184] no items to output this cycle
I0320 21:57:33.409781 543705 cpu.go:275] no items to output this cycle
I0320 21:57:38.633901 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 21:57:38.633907 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 21:57:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:43.410587 543705 memory.go:191] Add success.
I0320 21:57:43.409794 543705 cpu.go:282] Add success.
I0320 21:57:43.420322 543705 net.go:648] Add success.
I0320 21:57:43.423031 543705 net.go:770] primary dev: ETH0
I0320 21:57:43.423044 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:57:43.423056 543705 net.go:698] Add success.
I0320 21:57:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:57:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:57:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:57:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:57:53.409759 543705 memory.go:184] no items to output this cycle
I0320 21:57:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 21:58:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:03.409793 543705 memory.go:184] no items to output this cycle
I0320 21:58:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 21:58:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:13.409806 543705 memory.go:191] Add success.
I0320 21:58:13.409807 543705 cpu.go:282] Add success.
W0320 21:58:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:58:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:58:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:58:13.420090 543705 net.go:648] Add success.
I0320 21:58:13.422923 543705 net.go:770] primary dev: ETH0
I0320 21:58:13.422936 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:58:13.422949 543705 net.go:698] Add success.
I0320 21:58:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:58:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:58:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 21:58:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:58:14.456597 543705 disk_worker.go:494] system disk:vda1
I0320 21:58:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:58:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:58:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:58:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:58:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:58:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:58:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:23.409785 543705 memory.go:184] no items to output this cycle
I0320 21:58:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 21:58:26.349676 543705 disk_info.go:125] begin check local disk info of client
I0320 21:58:26.352148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:58:26.352154 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004dfac0 0xc0004dfb00]
E0320 21:58:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:33.409788 543705 memory.go:184] no items to output this cycle
I0320 21:58:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 21:58:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:43.409798 543705 memory.go:191] Add success.
I0320 21:58:43.409801 543705 cpu.go:282] Add success.
I0320 21:58:43.419985 543705 net.go:648] Add success.
I0320 21:58:43.422637 543705 net.go:770] primary dev: ETH0
I0320 21:58:43.422649 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:58:43.422662 543705 net.go:698] Add success.
I0320 21:58:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:58:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:58:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:58:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:58:53.409797 543705 memory.go:184] no items to output this cycle
I0320 21:58:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 21:59:03.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:03.409822 543705 memory.go:184] no items to output this cycle
I0320 21:59:03.409832 543705 cpu.go:275] no items to output this cycle
E0320 21:59:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:13.409779 543705 memory.go:191] Add success.
I0320 21:59:13.409814 543705 cpu.go:282] Add success.
W0320 21:59:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 21:59:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 21:59:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 21:59:13.420116 543705 net.go:648] Add success.
I0320 21:59:13.422857 543705 net.go:770] primary dev: ETH0
I0320 21:59:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:59:13.422882 543705 net.go:698] Add success.
I0320 21:59:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 21:59:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 21:59:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 21:59:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0320 21:59:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 21:59:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 21:59:15.455952 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 21:59:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:59:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:59:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 21:59:16.472399 543705 disk_local_worker.go:436] Get disk info: []
E0320 21:59:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:23.409769 543705 memory.go:184] no items to output this cycle
I0320 21:59:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 21:59:26.353673 543705 disk_info.go:125] begin check local disk info of client
I0320 21:59:26.356103 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 21:59:26.356109 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb380 0xc0001fb3c0]
E0320 21:59:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:33.409776 543705 memory.go:184] no items to output this cycle
I0320 21:59:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 21:59:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:43.409812 543705 memory.go:191] Add success.
I0320 21:59:43.409822 543705 cpu.go:282] Add success.
I0320 21:59:43.419884 543705 net.go:648] Add success.
I0320 21:59:43.422870 543705 net.go:770] primary dev: ETH0
I0320 21:59:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0320 21:59:43.422895 543705 net.go:698] Add success.
I0320 21:59:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 21:59:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 21:59:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 21:59:53.410377 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 21:59:53.410397 543705 memory.go:184] no items to output this cycle
I0320 21:59:53.410409 543705 cpu.go:275] no items to output this cycle
E0320 22:00:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:03.409797 543705 memory.go:184] no items to output this cycle
I0320 22:00:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 22:00:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:13.409807 543705 memory.go:191] Add success.
I0320 22:00:13.409808 543705 cpu.go:282] Add success.
W0320 22:00:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:00:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:00:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:00:13.420117 543705 net.go:648] Add success.
I0320 22:00:13.423034 543705 net.go:770] primary dev: ETH0
I0320 22:00:13.423047 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:00:13.423059 543705 net.go:698] Add success.
I0320 22:00:13.463711 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"711cf543-89cd-4520-91b4-25c0ab28e286","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:00:13.463757 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:00:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:00:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:00:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 22:00:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:00:14.456948 543705 disk_worker.go:494] system disk:vda1
I0320 22:00:14.456985 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:00:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:00:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:00:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:00:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:00:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:00:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:23.409805 543705 memory.go:184] no items to output this cycle
I0320 22:00:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 22:00:26.357672 543705 disk_info.go:125] begin check local disk info of client
I0320 22:00:26.360166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:00:26.360171 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004deac0 0xc0004deb00]
E0320 22:00:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:33.409803 543705 memory.go:184] no items to output this cycle
I0320 22:00:33.409813 543705 cpu.go:275] no items to output this cycle
I0320 22:00:38.634871 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:00:38.634878 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:00:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:43.410802 543705 memory.go:191] Add success.
I0320 22:00:43.409785 543705 cpu.go:282] Add success.
I0320 22:00:43.420467 543705 net.go:648] Add success.
I0320 22:00:43.423229 543705 net.go:770] primary dev: ETH0
I0320 22:00:43.423243 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:00:43.423255 543705 net.go:698] Add success.
I0320 22:00:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:00:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:00:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:00:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:00:53.409799 543705 memory.go:184] no items to output this cycle
I0320 22:00:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 22:01:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:03.409793 543705 memory.go:184] no items to output this cycle
I0320 22:01:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 22:01:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:13.409787 543705 cpu.go:282] Add success.
I0320 22:01:13.409796 543705 memory.go:191] Add success.
W0320 22:01:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:01:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:01:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:01:13.420062 543705 net.go:648] Add success.
I0320 22:01:13.422885 543705 net.go:770] primary dev: ETH0
I0320 22:01:13.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:01:13.422912 543705 net.go:698] Add success.
I0320 22:01:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:01:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:01:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0320 22:01:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:01:14.459129 543705 disk_worker.go:494] system disk:vda1
I0320 22:01:14.459157 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:01:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:01:16.457963 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:01:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:01:16.458046 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:01:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:01:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:23.409769 543705 memory.go:184] no items to output this cycle
I0320 22:01:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 22:01:26.361675 543705 disk_info.go:125] begin check local disk info of client
I0320 22:01:26.364117 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:01:26.364122 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb440 0xc0001fb480]
E0320 22:01:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:33.409792 543705 memory.go:184] no items to output this cycle
I0320 22:01:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 22:01:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:43.409793 543705 memory.go:191] Add success.
I0320 22:01:43.409794 543705 cpu.go:282] Add success.
I0320 22:01:43.419875 543705 net.go:648] Add success.
I0320 22:01:43.422508 543705 net.go:770] primary dev: ETH0
I0320 22:01:43.422521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:01:43.422535 543705 net.go:698] Add success.
I0320 22:01:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:01:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:01:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:01:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:01:53.409784 543705 memory.go:184] no items to output this cycle
I0320 22:01:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 22:02:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:03.409791 543705 memory.go:184] no items to output this cycle
I0320 22:02:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 22:02:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:13.409794 543705 memory.go:191] Add success.
I0320 22:02:13.409798 543705 cpu.go:282] Add success.
W0320 22:02:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:02:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:02:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:02:13.420060 543705 net.go:648] Add success.
I0320 22:02:13.422536 543705 net.go:770] primary dev: ETH0
I0320 22:02:13.422548 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:02:13.422560 543705 net.go:698] Add success.
W0320 22:02:14.455169 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:02:14.455179 543705 disk_worker.go:708] disk space is not compliant
W0320 22:02:14.455183 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:02:14.456159 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:02:14.456168 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:02:14.456175 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:02:14.456466 543705 disk_worker.go:494] system disk:vda1
I0320 22:02:14.456520 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:02:15.456789 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:02:15.456799 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:02:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:02:16.457971 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:02:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:02:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:02:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:02:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:23.409791 543705 memory.go:184] no items to output this cycle
I0320 22:02:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 22:02:26.365672 543705 disk_info.go:125] begin check local disk info of client
I0320 22:02:26.368111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:02:26.368118 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a1300 0xc0004a1340]
E0320 22:02:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:33.409805 543705 memory.go:184] no items to output this cycle
I0320 22:02:33.409820 543705 cpu.go:275] no items to output this cycle
E0320 22:02:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:43.409823 543705 memory.go:191] Add success.
I0320 22:02:43.409825 543705 cpu.go:282] Add success.
I0320 22:02:43.419963 543705 net.go:648] Add success.
I0320 22:02:43.422482 543705 net.go:770] primary dev: ETH0
I0320 22:02:43.422496 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:02:43.422512 543705 net.go:698] Add success.
I0320 22:02:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:02:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:02:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:02:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:02:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:02:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 22:03:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:03.409776 543705 memory.go:184] no items to output this cycle
I0320 22:03:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 22:03:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:13.409786 543705 memory.go:191] Add success.
I0320 22:03:13.409807 543705 cpu.go:282] Add success.
W0320 22:03:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:03:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:03:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:03:13.420156 543705 net.go:648] Add success.
I0320 22:03:13.422750 543705 net.go:770] primary dev: ETH0
I0320 22:03:13.422764 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:03:13.422778 543705 net.go:698] Add success.
I0320 22:03:13.468997 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"59192428-3630-4b7a-9816-8bf25288bd5d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:03:13.469032 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:03:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:03:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:03:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 22:03:14.455176 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:03:14.456494 543705 disk_worker.go:494] system disk:vda1
I0320 22:03:14.456541 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:03:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:03:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:03:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:03:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:03:16.472375 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:03:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:23.409771 543705 memory.go:184] no items to output this cycle
I0320 22:03:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 22:03:26.369672 543705 disk_info.go:125] begin check local disk info of client
I0320 22:03:26.372157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:03:26.372162 543705 disk_info.go:196] parse disk info done, disk is : [0xc000359780 0xc0003597c0]
E0320 22:03:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:33.409806 543705 memory.go:184] no items to output this cycle
I0320 22:03:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 22:03:38.635890 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:03:38.635897 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:03:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:43.410573 543705 memory.go:191] Add success.
I0320 22:03:43.409785 543705 cpu.go:282] Add success.
I0320 22:03:43.420266 543705 net.go:648] Add success.
I0320 22:03:43.422937 543705 net.go:770] primary dev: ETH0
I0320 22:03:43.422952 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:03:43.422967 543705 net.go:698] Add success.
I0320 22:03:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:03:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:03:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:03:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:03:53.409787 543705 memory.go:184] no items to output this cycle
I0320 22:03:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 22:04:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:03.409786 543705 memory.go:184] no items to output this cycle
I0320 22:04:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 22:04:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:13.409818 543705 memory.go:191] Add success.
I0320 22:04:13.409825 543705 cpu.go:282] Add success.
W0320 22:04:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:04:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:04:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:04:13.420179 543705 net.go:648] Add success.
I0320 22:04:13.423322 543705 net.go:770] primary dev: ETH0
I0320 22:04:13.423337 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:04:13.423353 543705 net.go:698] Add success.
I0320 22:04:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:04:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:04:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0320 22:04:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:04:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 22:04:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:04:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:04:16.458041 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:04:16.458103 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:04:16.458125 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:04:16.472470 543705 disk_local_worker.go:436] Get disk info: []
I0320 22:04:23.409871 543705 cpu.go:275] no items to output this cycle
E0320 22:04:23.409945 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:23.409961 543705 memory.go:184] no items to output this cycle
I0320 22:04:26.373672 543705 disk_info.go:125] begin check local disk info of client
I0320 22:04:26.376174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:04:26.376181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d1d80 0xc0003d1dc0]
E0320 22:04:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 22:04:33.409785 543705 memory.go:184] no items to output this cycle
E0320 22:04:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:43.409777 543705 memory.go:191] Add success.
I0320 22:04:43.409800 543705 cpu.go:282] Add success.
I0320 22:04:43.419899 543705 net.go:648] Add success.
I0320 22:04:43.422653 543705 net.go:770] primary dev: ETH0
I0320 22:04:43.422666 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:04:43.422678 543705 net.go:698] Add success.
I0320 22:04:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:04:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:04:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:04:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:04:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:04:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 22:05:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:03.409788 543705 memory.go:184] no items to output this cycle
I0320 22:05:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 22:05:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:13.409801 543705 memory.go:191] Add success.
I0320 22:05:13.409806 543705 cpu.go:282] Add success.
W0320 22:05:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:05:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:05:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:05:13.420199 543705 net.go:648] Add success.
I0320 22:05:13.422989 543705 net.go:770] primary dev: ETH0
I0320 22:05:13.423002 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:05:13.423014 543705 net.go:698] Add success.
I0320 22:05:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:05:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:05:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 22:05:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:05:14.456486 543705 disk_worker.go:494] system disk:vda1
I0320 22:05:14.456533 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:05:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:05:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:05:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:05:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:05:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:05:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:23.409803 543705 memory.go:184] no items to output this cycle
I0320 22:05:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 22:05:26.377675 543705 disk_info.go:125] begin check local disk info of client
I0320 22:05:26.380156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:05:26.380163 543705 disk_info.go:196] parse disk info done, disk is : [0xc000354800 0xc000354840]
E0320 22:05:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:33.409785 543705 memory.go:184] no items to output this cycle
I0320 22:05:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:05:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:43.409789 543705 memory.go:191] Add success.
I0320 22:05:43.409816 543705 cpu.go:282] Add success.
I0320 22:05:43.419952 543705 net.go:648] Add success.
I0320 22:05:43.422839 543705 net.go:770] primary dev: ETH0
I0320 22:05:43.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:05:43.422863 543705 net.go:698] Add success.
I0320 22:05:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:05:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:05:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:05:53.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:05:53.409811 543705 memory.go:184] no items to output this cycle
I0320 22:05:53.409822 543705 cpu.go:275] no items to output this cycle
E0320 22:06:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:03.409784 543705 memory.go:184] no items to output this cycle
I0320 22:06:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 22:06:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:13.409819 543705 memory.go:191] Add success.
I0320 22:06:13.409826 543705 cpu.go:282] Add success.
W0320 22:06:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:06:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:06:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:06:13.420127 543705 net.go:648] Add success.
I0320 22:06:13.422916 543705 net.go:770] primary dev: ETH0
I0320 22:06:13.422928 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:06:13.422940 543705 net.go:698] Add success.
I0320 22:06:13.469202 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"45ff0129-96aa-48a4-93fe-76d5d306b648","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:06:13.469237 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:06:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:06:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:06:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 22:06:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:06:14.456544 543705 disk_worker.go:494] system disk:vda1
I0320 22:06:14.456596 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:06:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:06:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:06:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:06:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:06:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:06:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:23.409781 543705 memory.go:184] no items to output this cycle
I0320 22:06:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 22:06:26.381668 543705 disk_info.go:125] begin check local disk info of client
I0320 22:06:26.384111 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:06:26.384117 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003de380 0xc0003de3c0]
E0320 22:06:33.409891 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:33.409909 543705 memory.go:184] no items to output this cycle
I0320 22:06:33.409960 543705 cpu.go:275] no items to output this cycle
I0320 22:06:38.636885 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:06:38.636891 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:06:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:43.410868 543705 memory.go:191] Add success.
I0320 22:06:43.409823 543705 cpu.go:282] Add success.
I0320 22:06:43.420621 543705 net.go:648] Add success.
I0320 22:06:43.423624 543705 net.go:770] primary dev: ETH0
I0320 22:06:43.423637 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:06:43.423649 543705 net.go:698] Add success.
I0320 22:06:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:06:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:06:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:06:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:06:53.409777 543705 memory.go:184] no items to output this cycle
I0320 22:06:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 22:07:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:03.409817 543705 memory.go:184] no items to output this cycle
I0320 22:07:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 22:07:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:13.409789 543705 memory.go:191] Add success.
I0320 22:07:13.409806 543705 cpu.go:282] Add success.
W0320 22:07:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:07:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:07:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:07:13.420115 543705 net.go:648] Add success.
I0320 22:07:13.422982 543705 net.go:770] primary dev: ETH0
I0320 22:07:13.422996 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:07:13.423008 543705 net.go:698] Add success.
I0320 22:07:13.453558 543705 event_worker.go:152] Polling the log file for events...
W0320 22:07:14.455164 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:07:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 22:07:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:07:14.455884 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:07:14.455893 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:07:14.455899 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:07:14.456534 543705 disk_worker.go:494] system disk:vda1
I0320 22:07:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:07:15.456828 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:07:15.456837 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:07:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:07:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:07:16.457985 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:07:16.458005 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:07:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:07:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:23.409807 543705 memory.go:184] no items to output this cycle
I0320 22:07:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 22:07:26.385673 543705 disk_info.go:125] begin check local disk info of client
I0320 22:07:26.388105 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:07:26.388111 543705 disk_info.go:196] parse disk info done, disk is : [0xc00054b100 0xc00054b140]
E0320 22:07:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:33.409910 543705 memory.go:184] no items to output this cycle
I0320 22:07:33.409927 543705 cpu.go:275] no items to output this cycle
E0320 22:07:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:43.409797 543705 memory.go:191] Add success.
I0320 22:07:43.409814 543705 cpu.go:282] Add success.
I0320 22:07:43.419893 543705 net.go:648] Add success.
I0320 22:07:43.423015 543705 net.go:770] primary dev: ETH0
I0320 22:07:43.423030 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:07:43.423044 543705 net.go:698] Add success.
I0320 22:07:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:07:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:07:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:07:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:07:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:07:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 22:08:03.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:03.409818 543705 memory.go:184] no items to output this cycle
I0320 22:08:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 22:08:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:13.409811 543705 memory.go:191] Add success.
I0320 22:08:13.409815 543705 cpu.go:282] Add success.
W0320 22:08:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:08:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:08:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:08:13.420121 543705 net.go:648] Add success.
I0320 22:08:13.423362 543705 net.go:770] primary dev: ETH0
I0320 22:08:13.423379 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:08:13.423391 543705 net.go:698] Add success.
I0320 22:08:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:08:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:08:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0320 22:08:14.455196 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:08:14.456593 543705 disk_worker.go:494] system disk:vda1
I0320 22:08:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:08:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:08:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:08:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:08:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:08:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:08:23.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:23.409772 543705 memory.go:184] no items to output this cycle
I0320 22:08:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 22:08:26.391997 543705 disk_info.go:125] begin check local disk info of client
I0320 22:08:26.394553 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:08:26.394559 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ba6c0 0xc0003ba700]
E0320 22:08:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:33.409765 543705 memory.go:184] no items to output this cycle
I0320 22:08:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 22:08:43.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:43.409907 543705 memory.go:191] Add success.
I0320 22:08:43.409971 543705 cpu.go:282] Add success.
I0320 22:08:43.419708 543705 net.go:648] Add success.
I0320 22:08:43.422479 543705 net.go:770] primary dev: ETH0
I0320 22:08:43.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:08:43.422503 543705 net.go:698] Add success.
I0320 22:08:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:08:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:08:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:08:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:08:53.409782 543705 memory.go:184] no items to output this cycle
I0320 22:08:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 22:09:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:03.409807 543705 memory.go:184] no items to output this cycle
I0320 22:09:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 22:09:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:13.409792 543705 memory.go:191] Add success.
I0320 22:09:13.409809 543705 cpu.go:282] Add success.
W0320 22:09:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:09:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:09:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:09:13.420194 543705 net.go:648] Add success.
I0320 22:09:13.422962 543705 net.go:770] primary dev: ETH0
I0320 22:09:13.422976 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:09:13.422987 543705 net.go:698] Add success.
I0320 22:09:13.463795 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c97ff5e0-7236-4989-8dba-8a8bbe8721ce","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:09:13.463838 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:09:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:09:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:09:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0320 22:09:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:09:14.456746 543705 disk_worker.go:494] system disk:vda1
I0320 22:09:14.456776 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:09:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:09:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:09:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:09:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:09:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:09:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:23.409785 543705 memory.go:184] no items to output this cycle
I0320 22:09:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 22:09:26.398161 543705 disk_info.go:125] begin check local disk info of client
I0320 22:09:26.400645 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:09:26.400651 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f0440 0xc0003f0480]
E0320 22:09:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:33.409797 543705 memory.go:184] no items to output this cycle
I0320 22:09:33.409810 543705 cpu.go:275] no items to output this cycle
I0320 22:09:38.637739 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:09:38.637746 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:09:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:43.410675 543705 memory.go:191] Add success.
I0320 22:09:43.409808 543705 cpu.go:282] Add success.
I0320 22:09:43.420668 543705 net.go:648] Add success.
I0320 22:09:43.423665 543705 net.go:770] primary dev: ETH0
I0320 22:09:43.423678 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:09:43.423691 543705 net.go:698] Add success.
I0320 22:09:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:09:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:09:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:09:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:09:53.409797 543705 memory.go:184] no items to output this cycle
I0320 22:09:53.409813 543705 cpu.go:275] no items to output this cycle
E0320 22:10:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:03.409791 543705 memory.go:184] no items to output this cycle
I0320 22:10:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 22:10:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:13.409799 543705 memory.go:191] Add success.
W0320 22:10:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:10:13.409832 543705 cpu.go:282] Add success.
W0320 22:10:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:10:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:10:13.420262 543705 net.go:648] Add success.
I0320 22:10:13.422920 543705 net.go:770] primary dev: ETH0
I0320 22:10:13.422934 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:10:13.422947 543705 net.go:698] Add success.
I0320 22:10:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:10:14.455205 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:10:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 22:10:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:10:14.456586 543705 disk_worker.go:494] system disk:vda1
I0320 22:10:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:10:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:10:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:10:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:10:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:10:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:10:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:23.409770 543705 memory.go:184] no items to output this cycle
I0320 22:10:23.409791 543705 cpu.go:275] no items to output this cycle
I0320 22:10:26.401675 543705 disk_info.go:125] begin check local disk info of client
I0320 22:10:26.404151 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:10:26.404157 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003664c0 0xc000366500]
E0320 22:10:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:33.409807 543705 memory.go:184] no items to output this cycle
I0320 22:10:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 22:10:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:43.409796 543705 memory.go:191] Add success.
I0320 22:10:43.409797 543705 cpu.go:282] Add success.
I0320 22:10:43.419996 543705 net.go:648] Add success.
I0320 22:10:43.422605 543705 net.go:770] primary dev: ETH0
I0320 22:10:43.422620 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:10:43.422634 543705 net.go:698] Add success.
I0320 22:10:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:10:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:10:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:10:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:10:53.409782 543705 memory.go:184] no items to output this cycle
I0320 22:10:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 22:11:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:03.409811 543705 memory.go:184] no items to output this cycle
I0320 22:11:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 22:11:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:13.409824 543705 memory.go:191] Add success.
I0320 22:11:13.409838 543705 cpu.go:282] Add success.
W0320 22:11:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:11:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:11:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:11:13.420286 543705 net.go:648] Add success.
I0320 22:11:13.423506 543705 net.go:770] primary dev: ETH0
I0320 22:11:13.423521 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:11:13.423535 543705 net.go:698] Add success.
I0320 22:11:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:11:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:11:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 22:11:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:11:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 22:11:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:11:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:11:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:11:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:11:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:11:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:11:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:23.409768 543705 memory.go:184] no items to output this cycle
I0320 22:11:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 22:11:26.405674 543705 disk_info.go:125] begin check local disk info of client
I0320 22:11:26.408180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:11:26.408188 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003eea80 0xc0003eeac0]
E0320 22:11:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:33.409792 543705 memory.go:184] no items to output this cycle
I0320 22:11:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 22:11:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:43.409822 543705 memory.go:191] Add success.
I0320 22:11:43.409830 543705 cpu.go:282] Add success.
I0320 22:11:43.419956 543705 net.go:648] Add success.
I0320 22:11:43.423419 543705 net.go:770] primary dev: ETH0
I0320 22:11:43.423433 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:11:43.423447 543705 net.go:698] Add success.
I0320 22:11:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:11:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:11:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:11:53.409848 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:11:53.409866 543705 memory.go:184] no items to output this cycle
I0320 22:11:53.410020 543705 cpu.go:275] no items to output this cycle
E0320 22:12:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:03.409789 543705 memory.go:184] no items to output this cycle
I0320 22:12:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 22:12:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:13.409802 543705 memory.go:191] Add success.
I0320 22:12:13.409806 543705 cpu.go:282] Add success.
W0320 22:12:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:12:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:12:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:12:13.420136 543705 net.go:648] Add success.
I0320 22:12:13.422752 543705 net.go:770] primary dev: ETH0
I0320 22:12:13.422767 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:12:13.422781 543705 net.go:698] Add success.
I0320 22:12:13.469401 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"13612a44-fc45-42b5-894f-9129459fc1c1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:12:13.469436 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 22:12:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:12:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 22:12:14.455193 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:12:14.455949 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:12:14.455958 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:12:14.455963 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:12:14.456574 543705 disk_worker.go:494] system disk:vda1
I0320 22:12:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:12:15.456806 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:12:15.456814 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:12:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:12:16.457990 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:12:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:12:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:12:16.472444 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:12:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:23.409791 543705 memory.go:184] no items to output this cycle
I0320 22:12:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 22:12:26.409674 543705 disk_info.go:125] begin check local disk info of client
I0320 22:12:26.412140 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:12:26.412147 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa8c0 0xc0001fa900]
E0320 22:12:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:33.409779 543705 memory.go:184] no items to output this cycle
I0320 22:12:33.409805 543705 cpu.go:275] no items to output this cycle
I0320 22:12:38.638896 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:12:38.638902 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:12:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:43.410793 543705 memory.go:191] Add success.
I0320 22:12:43.409804 543705 cpu.go:282] Add success.
I0320 22:12:43.420717 543705 net.go:648] Add success.
I0320 22:12:43.423459 543705 net.go:770] primary dev: ETH0
I0320 22:12:43.423474 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:12:43.423487 543705 net.go:698] Add success.
I0320 22:12:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:12:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:12:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:12:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:12:53.409792 543705 memory.go:184] no items to output this cycle
I0320 22:12:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 22:13:03.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:03.409790 543705 memory.go:184] no items to output this cycle
I0320 22:13:03.409798 543705 cpu.go:275] no items to output this cycle
E0320 22:13:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:13.409792 543705 memory.go:191] Add success.
W0320 22:13:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:13:13.409818 543705 cpu.go:282] Add success.
W0320 22:13:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:13:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:13:13.420162 543705 net.go:648] Add success.
I0320 22:13:13.422765 543705 net.go:770] primary dev: ETH0
I0320 22:13:13.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:13:13.422794 543705 net.go:698] Add success.
I0320 22:13:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:13:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:13:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0320 22:13:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:13:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 22:13:14.456551 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:13:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:13:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:13:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:13:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:13:16.472404 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:13:23.410502 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:23.410517 543705 memory.go:184] no items to output this cycle
I0320 22:13:23.410520 543705 cpu.go:275] no items to output this cycle
I0320 22:13:26.412797 543705 disk_info.go:125] begin check local disk info of client
I0320 22:13:26.415413 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:13:26.415421 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004eec00 0xc0004eec40]
E0320 22:13:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:33.409813 543705 memory.go:184] no items to output this cycle
I0320 22:13:33.409830 543705 cpu.go:275] no items to output this cycle
E0320 22:13:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:43.409866 543705 memory.go:191] Add success.
I0320 22:13:43.409921 543705 cpu.go:282] Add success.
I0320 22:13:43.419736 543705 net.go:648] Add success.
I0320 22:13:43.422548 543705 net.go:770] primary dev: ETH0
I0320 22:13:43.422560 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:13:43.422572 543705 net.go:698] Add success.
I0320 22:13:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:13:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:13:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:13:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:13:53.409776 543705 memory.go:184] no items to output this cycle
I0320 22:13:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:14:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:03.409780 543705 memory.go:184] no items to output this cycle
I0320 22:14:03.409787 543705 cpu.go:275] no items to output this cycle
E0320 22:14:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:13.409801 543705 memory.go:191] Add success.
I0320 22:14:13.409824 543705 cpu.go:282] Add success.
W0320 22:14:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:14:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:14:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:14:13.420194 543705 net.go:648] Add success.
I0320 22:14:13.422764 543705 net.go:770] primary dev: ETH0
I0320 22:14:13.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:14:13.422789 543705 net.go:698] Add success.
I0320 22:14:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:14:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:14:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0320 22:14:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:14:14.456609 543705 disk_worker.go:494] system disk:vda1
I0320 22:14:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:14:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:14:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:14:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:14:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:14:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:14:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:23.409803 543705 memory.go:184] no items to output this cycle
I0320 22:14:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 22:14:26.415795 543705 disk_info.go:125] begin check local disk info of client
I0320 22:14:26.418290 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:14:26.418297 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002648c0 0xc000264900]
E0320 22:14:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:33.409758 543705 memory.go:184] no items to output this cycle
I0320 22:14:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:14:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:43.409802 543705 memory.go:191] Add success.
I0320 22:14:43.409819 543705 cpu.go:282] Add success.
I0320 22:14:43.419988 543705 net.go:648] Add success.
I0320 22:14:43.423240 543705 net.go:770] primary dev: ETH0
I0320 22:14:43.423253 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:14:43.423266 543705 net.go:698] Add success.
I0320 22:14:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:14:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:14:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:14:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:14:53.409768 543705 memory.go:184] no items to output this cycle
I0320 22:14:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 22:15:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:03.409780 543705 memory.go:184] no items to output this cycle
I0320 22:15:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 22:15:13.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:13.409780 543705 memory.go:191] Add success.
W0320 22:15:13.409806 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:15:13.409806 543705 cpu.go:282] Add success.
W0320 22:15:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:15:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:15:13.420139 543705 net.go:648] Add success.
I0320 22:15:13.422940 543705 net.go:770] primary dev: ETH0
I0320 22:15:13.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:15:13.422967 543705 net.go:698] Add success.
I0320 22:15:13.472146 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"47f9314c-d176-4966-b179-6639f8025557","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:15:13.472178 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:15:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:15:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:15:14.455228 543705 disk_worker.go:708] disk space is not compliant
W0320 22:15:14.455231 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:15:14.456770 543705 disk_worker.go:494] system disk:vda1
I0320 22:15:14.456803 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:15:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:15:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:15:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:15:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:15:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:15:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:23.409771 543705 memory.go:184] no items to output this cycle
I0320 22:15:23.409797 543705 cpu.go:275] no items to output this cycle
I0320 22:15:26.418784 543705 disk_info.go:125] begin check local disk info of client
I0320 22:15:26.421274 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:15:26.421281 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa440 0xc0001fa480]
E0320 22:15:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:33.409771 543705 memory.go:184] no items to output this cycle
I0320 22:15:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 22:15:38.639921 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:15:38.639928 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:15:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:43.410684 543705 memory.go:191] Add success.
I0320 22:15:43.409821 543705 cpu.go:282] Add success.
I0320 22:15:43.420439 543705 net.go:648] Add success.
I0320 22:15:43.423066 543705 net.go:770] primary dev: ETH0
I0320 22:15:43.423081 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:15:43.423095 543705 net.go:698] Add success.
I0320 22:15:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:15:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:15:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:15:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:15:53.409771 543705 memory.go:184] no items to output this cycle
I0320 22:15:53.409786 543705 cpu.go:275] no items to output this cycle
E0320 22:16:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:03.409784 543705 memory.go:184] no items to output this cycle
I0320 22:16:03.409802 543705 cpu.go:275] no items to output this cycle
E0320 22:16:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:13.409799 543705 memory.go:191] Add success.
I0320 22:16:13.409811 543705 cpu.go:282] Add success.
W0320 22:16:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:16:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:16:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:16:13.420258 543705 net.go:648] Add success.
I0320 22:16:13.422897 543705 net.go:770] primary dev: ETH0
I0320 22:16:13.422911 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:16:13.422924 543705 net.go:698] Add success.
I0320 22:16:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:16:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:16:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 22:16:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:16:14.456575 543705 disk_worker.go:494] system disk:vda1
I0320 22:16:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:16:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:16:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:16:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:16:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:16:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:16:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:23.409767 543705 memory.go:184] no items to output this cycle
I0320 22:16:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 22:16:26.421795 543705 disk_info.go:125] begin check local disk info of client
I0320 22:16:26.424214 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:16:26.424220 543705 disk_info.go:196] parse disk info done, disk is : [0xc000264980 0xc0002649c0]
E0320 22:16:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:33.409779 543705 memory.go:184] no items to output this cycle
I0320 22:16:33.409780 543705 cpu.go:275] no items to output this cycle
E0320 22:16:43.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:43.409911 543705 cpu.go:282] Add success.
I0320 22:16:43.409928 543705 memory.go:191] Add success.
I0320 22:16:43.419721 543705 net.go:648] Add success.
I0320 22:16:43.423075 543705 net.go:770] primary dev: ETH0
I0320 22:16:43.423088 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:16:43.423099 543705 net.go:698] Add success.
I0320 22:16:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:16:46.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:16:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:16:53.410235 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:16:53.410250 543705 memory.go:184] no items to output this cycle
I0320 22:16:53.410267 543705 cpu.go:275] no items to output this cycle
E0320 22:17:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:03.409793 543705 cpu.go:275] no items to output this cycle
I0320 22:17:03.409800 543705 memory.go:184] no items to output this cycle
E0320 22:17:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:13.409807 543705 memory.go:191] Add success.
I0320 22:17:13.409808 543705 cpu.go:282] Add success.
W0320 22:17:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:17:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:17:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:17:13.420154 543705 net.go:648] Add success.
I0320 22:17:13.422878 543705 net.go:770] primary dev: ETH0
I0320 22:17:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:17:13.422903 543705 net.go:698] Add success.
I0320 22:17:13.453493 543705 event_worker.go:152] Polling the log file for events...
W0320 22:17:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:17:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 22:17:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:17:14.455928 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:17:14.455937 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:17:14.455943 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:17:14.456580 543705 disk_worker.go:494] system disk:vda1
I0320 22:17:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:17:15.456811 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:17:15.456819 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:17:16.457951 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:17:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:17:16.458005 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:17:16.458025 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:17:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:17:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:23.409801 543705 memory.go:184] no items to output this cycle
I0320 22:17:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 22:17:26.424811 543705 disk_info.go:125] begin check local disk info of client
I0320 22:17:26.427325 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:17:26.427331 543705 disk_info.go:196] parse disk info done, disk is : [0xc000290080 0xc0002900c0]
E0320 22:17:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:33.409784 543705 memory.go:184] no items to output this cycle
I0320 22:17:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 22:17:43.409894 543705 cpu.go:282] Add success.
E0320 22:17:43.409896 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:43.409921 543705 memory.go:191] Add success.
I0320 22:17:43.419713 543705 net.go:648] Add success.
I0320 22:17:43.422969 543705 net.go:770] primary dev: ETH0
I0320 22:17:43.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:17:43.422995 543705 net.go:698] Add success.
I0320 22:17:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:17:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:17:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:17:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:17:53.409775 543705 memory.go:184] no items to output this cycle
I0320 22:17:53.409775 543705 cpu.go:275] no items to output this cycle
E0320 22:18:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:03.409778 543705 memory.go:184] no items to output this cycle
I0320 22:18:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 22:18:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:13.409785 543705 memory.go:191] Add success.
W0320 22:18:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:18:13.409819 543705 cpu.go:282] Add success.
W0320 22:18:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:18:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:18:13.420140 543705 net.go:648] Add success.
I0320 22:18:13.423014 543705 net.go:770] primary dev: ETH0
I0320 22:18:13.423030 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:18:13.423044 543705 net.go:698] Add success.
I0320 22:18:13.469694 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"aa7fe69d-c4c3-46ae-9534-76a238dc0ea2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:18:13.469727 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:18:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:18:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 22:18:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:18:14.456769 543705 disk_worker.go:494] system disk:vda1
I0320 22:18:14.456799 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:18:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:18:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:18:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:18:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:18:16.472419 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:18:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 22:18:23.409774 543705 memory.go:184] no items to output this cycle
I0320 22:18:26.427829 543705 disk_info.go:125] begin check local disk info of client
I0320 22:18:26.430309 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:18:26.430315 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f8780 0xc0001f87c0]
E0320 22:18:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:33.409766 543705 memory.go:184] no items to output this cycle
I0320 22:18:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 22:18:38.640906 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:18:38.640913 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:18:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:43.410710 543705 memory.go:191] Add success.
I0320 22:18:43.409809 543705 cpu.go:282] Add success.
I0320 22:18:43.420420 543705 net.go:648] Add success.
I0320 22:18:43.423475 543705 net.go:770] primary dev: ETH0
I0320 22:18:43.423490 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:18:43.423504 543705 net.go:698] Add success.
I0320 22:18:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:18:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:18:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:18:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:18:53.409773 543705 memory.go:184] no items to output this cycle
I0320 22:18:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 22:19:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:03.409808 543705 memory.go:184] no items to output this cycle
I0320 22:19:03.409823 543705 cpu.go:275] no items to output this cycle
E0320 22:19:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:13.409819 543705 memory.go:191] Add success.
I0320 22:19:13.409822 543705 cpu.go:282] Add success.
W0320 22:19:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:19:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:19:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:19:13.420136 543705 net.go:648] Add success.
I0320 22:19:13.422693 543705 net.go:770] primary dev: ETH0
I0320 22:19:13.422708 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:19:13.422720 543705 net.go:698] Add success.
I0320 22:19:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:19:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:19:14.455158 543705 disk_worker.go:708] disk space is not compliant
W0320 22:19:14.455161 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:19:14.456507 543705 disk_worker.go:494] system disk:vda1
I0320 22:19:14.456552 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:19:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:19:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:19:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:19:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:19:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:19:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 22:19:23.409783 543705 memory.go:184] no items to output this cycle
I0320 22:19:26.430850 543705 disk_info.go:125] begin check local disk info of client
I0320 22:19:26.433348 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:19:26.433355 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a4800 0xc0004a4840]
E0320 22:19:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:33.409775 543705 memory.go:184] no items to output this cycle
I0320 22:19:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 22:19:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:43.409790 543705 memory.go:191] Add success.
I0320 22:19:43.409794 543705 cpu.go:282] Add success.
I0320 22:19:43.419977 543705 net.go:648] Add success.
I0320 22:19:43.422927 543705 net.go:770] primary dev: ETH0
I0320 22:19:43.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:19:43.422956 543705 net.go:698] Add success.
I0320 22:19:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:19:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:19:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:19:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:19:53.409795 543705 memory.go:184] no items to output this cycle
I0320 22:19:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 22:20:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:03.409787 543705 cpu.go:275] no items to output this cycle
I0320 22:20:03.409794 543705 memory.go:184] no items to output this cycle
E0320 22:20:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:13.409816 543705 memory.go:191] Add success.
I0320 22:20:13.409830 543705 cpu.go:282] Add success.
W0320 22:20:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:20:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:20:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:20:13.420187 543705 net.go:648] Add success.
I0320 22:20:13.422795 543705 net.go:770] primary dev: ETH0
I0320 22:20:13.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:20:13.422825 543705 net.go:698] Add success.
I0320 22:20:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:20:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:20:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 22:20:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:20:14.456610 543705 disk_worker.go:494] system disk:vda1
I0320 22:20:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:20:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:20:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:20:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:20:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:20:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:20:23.410383 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:23.410398 543705 memory.go:184] no items to output this cycle
I0320 22:20:23.410419 543705 cpu.go:275] no items to output this cycle
I0320 22:20:26.433864 543705 disk_info.go:125] begin check local disk info of client
I0320 22:20:26.436348 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:20:26.436354 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ad00 0xc00007ad40]
E0320 22:20:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:33.409804 543705 memory.go:184] no items to output this cycle
I0320 22:20:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 22:20:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:43.409776 543705 memory.go:191] Add success.
I0320 22:20:43.409797 543705 cpu.go:282] Add success.
I0320 22:20:43.419908 543705 net.go:648] Add success.
I0320 22:20:43.422705 543705 net.go:770] primary dev: ETH0
I0320 22:20:43.422718 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:20:43.422730 543705 net.go:698] Add success.
I0320 22:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:20:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:20:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:20:53.409887 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:20:53.410011 543705 cpu.go:275] no items to output this cycle
I0320 22:20:53.410018 543705 memory.go:184] no items to output this cycle
E0320 22:21:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:03.409783 543705 memory.go:184] no items to output this cycle
I0320 22:21:03.409795 543705 cpu.go:275] no items to output this cycle
E0320 22:21:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:13.409794 543705 memory.go:191] Add success.
I0320 22:21:13.409815 543705 cpu.go:282] Add success.
W0320 22:21:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:21:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:21:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:21:13.420198 543705 net.go:648] Add success.
I0320 22:21:13.422803 543705 net.go:770] primary dev: ETH0
I0320 22:21:13.422819 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:21:13.422832 543705 net.go:698] Add success.
I0320 22:21:13.468648 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2c818ea1-80bd-4e7a-b3f1-097d889c1069","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:21:13.468682 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:21:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:21:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:21:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0320 22:21:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:21:14.456562 543705 disk_worker.go:494] system disk:vda1
I0320 22:21:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:21:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:21:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:21:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:21:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:21:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:21:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:23.409780 543705 memory.go:184] no items to output this cycle
I0320 22:21:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:21:26.436867 543705 disk_info.go:125] begin check local disk info of client
I0320 22:21:26.439375 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:21:26.439381 543705 disk_info.go:196] parse disk info done, disk is : [0xc000290ac0 0xc000290b00]
E0320 22:21:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:33.409806 543705 memory.go:184] no items to output this cycle
I0320 22:21:33.409817 543705 cpu.go:275] no items to output this cycle
I0320 22:21:38.641737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:21:38.641744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:21:43.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:43.410602 543705 memory.go:191] Add success.
I0320 22:21:43.409838 543705 cpu.go:282] Add success.
I0320 22:21:43.420511 543705 net.go:648] Add success.
I0320 22:21:43.423140 543705 net.go:770] primary dev: ETH0
I0320 22:21:43.423154 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:21:43.423165 543705 net.go:698] Add success.
I0320 22:21:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:21:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:21:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:21:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:21:53.409789 543705 memory.go:184] no items to output this cycle
I0320 22:21:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 22:22:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:03.409792 543705 memory.go:184] no items to output this cycle
I0320 22:22:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 22:22:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:13.409825 543705 memory.go:191] Add success.
I0320 22:22:13.409832 543705 cpu.go:282] Add success.
W0320 22:22:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:22:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:22:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:22:13.420198 543705 net.go:648] Add success.
I0320 22:22:13.422978 543705 net.go:770] primary dev: ETH0
I0320 22:22:13.422991 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:22:13.423001 543705 net.go:698] Add success.
W0320 22:22:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:22:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 22:22:14.455188 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:22:14.456678 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:22:14.456685 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:22:14.456690 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:22:14.456810 543705 disk_worker.go:494] system disk:vda1
I0320 22:22:14.456851 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:22:15.456884 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:22:15.456893 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:22:16.457917 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:22:16.457916 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:22:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:22:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:22:16.472315 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:22:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:23.409802 543705 memory.go:184] no items to output this cycle
I0320 22:22:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 22:22:26.439890 543705 disk_info.go:125] begin check local disk info of client
I0320 22:22:26.442361 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:22:26.442368 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464e80 0xc000464ec0]
E0320 22:22:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:33.409779 543705 memory.go:184] no items to output this cycle
I0320 22:22:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 22:22:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:43.409904 543705 cpu.go:282] Add success.
I0320 22:22:43.409963 543705 memory.go:191] Add success.
I0320 22:22:43.419730 543705 net.go:648] Add success.
I0320 22:22:43.420687 543705 net.go:770] primary dev: ETH0
I0320 22:22:43.420702 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:22:43.420715 543705 net.go:698] Add success.
I0320 22:22:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:22:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:22:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:22:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:22:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:22:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:23:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:03.409792 543705 memory.go:184] no items to output this cycle
I0320 22:23:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:23:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:13.409814 543705 memory.go:191] Add success.
I0320 22:23:13.409819 543705 cpu.go:282] Add success.
W0320 22:23:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:23:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:23:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:23:13.420229 543705 net.go:648] Add success.
I0320 22:23:13.423329 543705 net.go:770] primary dev: ETH0
I0320 22:23:13.423343 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:23:13.423355 543705 net.go:698] Add success.
I0320 22:23:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:23:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:23:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0320 22:23:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:23:14.456505 543705 disk_worker.go:494] system disk:vda1
I0320 22:23:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:23:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:23:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:23:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:23:16.458048 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:23:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:23:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:23.409772 543705 memory.go:184] no items to output this cycle
I0320 22:23:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 22:23:26.442908 543705 disk_info.go:125] begin check local disk info of client
I0320 22:23:26.445371 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:23:26.445378 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fec40 0xc0003fec80]
E0320 22:23:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:33.409776 543705 memory.go:184] no items to output this cycle
I0320 22:23:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 22:23:43.409850 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:43.409882 543705 memory.go:191] Add success.
I0320 22:23:43.409952 543705 cpu.go:282] Add success.
I0320 22:23:43.419733 543705 net.go:648] Add success.
I0320 22:23:43.422439 543705 net.go:770] primary dev: ETH0
I0320 22:23:43.422453 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:23:43.422467 543705 net.go:698] Add success.
I0320 22:23:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:23:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:23:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:23:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:23:53.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:23:53.409791 543705 memory.go:184] no items to output this cycle
E0320 22:24:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:03.409804 543705 memory.go:184] no items to output this cycle
I0320 22:24:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 22:24:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:13.409790 543705 memory.go:191] Add success.
I0320 22:24:13.409809 543705 cpu.go:282] Add success.
W0320 22:24:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:24:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:24:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:24:13.420274 543705 net.go:648] Add success.
I0320 22:24:13.423171 543705 net.go:770] primary dev: ETH0
I0320 22:24:13.423184 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:24:13.423197 543705 net.go:698] Add success.
I0320 22:24:13.463875 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2b5b592b-dcaa-44ee-9c3e-dadb100cbeda","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:24:13.463910 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:24:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:24:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:24:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 22:24:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:24:14.456717 543705 disk_worker.go:494] system disk:vda1
I0320 22:24:14.456750 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:24:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:24:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:24:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:24:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:24:16.472413 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:24:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:23.409776 543705 memory.go:184] no items to output this cycle
I0320 22:24:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 22:24:26.445920 543705 disk_info.go:125] begin check local disk info of client
I0320 22:24:26.448365 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:24:26.448371 543705 disk_info.go:196] parse disk info done, disk is : [0xc000290780 0xc0002907c0]
E0320 22:24:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:33.409799 543705 memory.go:184] no items to output this cycle
I0320 22:24:33.409816 543705 cpu.go:275] no items to output this cycle
I0320 22:24:38.641888 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:24:38.641894 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:24:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:43.410647 543705 memory.go:191] Add success.
I0320 22:24:43.409820 543705 cpu.go:282] Add success.
I0320 22:24:43.420347 543705 net.go:648] Add success.
I0320 22:24:43.423430 543705 net.go:770] primary dev: ETH0
I0320 22:24:43.423445 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:24:43.423459 543705 net.go:698] Add success.
I0320 22:24:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:24:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:24:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:24:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:24:53.409778 543705 memory.go:184] no items to output this cycle
I0320 22:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 22:25:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:03.409773 543705 memory.go:184] no items to output this cycle
I0320 22:25:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:25:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:13.409785 543705 memory.go:191] Add success.
W0320 22:25:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:25:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:25:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:25:13.409832 543705 cpu.go:282] Add success.
I0320 22:25:13.420206 543705 net.go:648] Add success.
I0320 22:25:13.421295 543705 net.go:770] primary dev: ETH0
I0320 22:25:13.421310 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:25:13.421324 543705 net.go:698] Add success.
I0320 22:25:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:25:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:25:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 22:25:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:25:14.456487 543705 disk_worker.go:494] system disk:vda1
I0320 22:25:14.456528 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:25:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:25:16.458000 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:25:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:25:16.458083 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:25:16.472443 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:25:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:23.409775 543705 cpu.go:275] no items to output this cycle
I0320 22:25:23.409783 543705 memory.go:184] no items to output this cycle
I0320 22:25:26.448927 543705 disk_info.go:125] begin check local disk info of client
I0320 22:25:26.451383 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:25:26.451389 543705 disk_info.go:196] parse disk info done, disk is : [0xc000272d40 0xc000272d80]
E0320 22:25:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:33.409776 543705 memory.go:184] no items to output this cycle
I0320 22:25:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:25:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:43.409821 543705 memory.go:191] Add success.
I0320 22:25:43.409831 543705 cpu.go:282] Add success.
I0320 22:25:43.420020 543705 net.go:648] Add success.
I0320 22:25:43.423115 543705 net.go:770] primary dev: ETH0
I0320 22:25:43.423128 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:25:43.423140 543705 net.go:698] Add success.
I0320 22:25:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:25:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:25:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:25:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:25:53.409796 543705 memory.go:184] no items to output this cycle
I0320 22:25:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 22:26:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:03.409809 543705 memory.go:184] no items to output this cycle
I0320 22:26:03.409822 543705 cpu.go:275] no items to output this cycle
E0320 22:26:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:13.409792 543705 memory.go:191] Add success.
I0320 22:26:13.409814 543705 cpu.go:282] Add success.
W0320 22:26:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:26:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:26:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:26:13.420699 543705 net.go:648] Add success.
I0320 22:26:13.423950 543705 net.go:770] primary dev: ETH0
I0320 22:26:13.423963 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:26:13.423975 543705 net.go:698] Add success.
I0320 22:26:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:26:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:26:14.455156 543705 disk_worker.go:708] disk space is not compliant
W0320 22:26:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:26:14.456520 543705 disk_worker.go:494] system disk:vda1
I0320 22:26:14.456567 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:26:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:26:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:26:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:26:16.472362 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:26:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:23.409774 543705 cpu.go:275] no items to output this cycle
I0320 22:26:23.409775 543705 memory.go:184] no items to output this cycle
I0320 22:26:26.451956 543705 disk_info.go:125] begin check local disk info of client
I0320 22:26:26.454439 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:26:26.454445 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab280 0xc0001ab2c0]
E0320 22:26:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:33.409799 543705 memory.go:184] no items to output this cycle
I0320 22:26:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 22:26:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:43.409800 543705 memory.go:191] Add success.
I0320 22:26:43.409801 543705 cpu.go:282] Add success.
I0320 22:26:43.419878 543705 net.go:648] Add success.
I0320 22:26:43.423005 543705 net.go:770] primary dev: ETH0
I0320 22:26:43.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:26:43.423031 543705 net.go:698] Add success.
I0320 22:26:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:26:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:26:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:26:53.410255 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:26:53.410276 543705 memory.go:184] no items to output this cycle
I0320 22:26:53.410277 543705 cpu.go:275] no items to output this cycle
E0320 22:27:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:03.409776 543705 memory.go:184] no items to output this cycle
I0320 22:27:03.409811 543705 cpu.go:275] no items to output this cycle
I0320 22:27:13.409790 543705 cpu.go:282] Add success.
E0320 22:27:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:13.409820 543705 memory.go:191] Add success.
W0320 22:27:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:27:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:27:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:27:13.420116 543705 net.go:648] Add success.
I0320 22:27:13.427418 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 22:27:13.427491 543705 net.go:770] primary dev: ETH0
I0320 22:27:13.427502 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:27:13.427514 543705 net.go:698] Add success.
I0320 22:27:13.453279 543705 event_worker.go:152] Polling the log file for events...
I0320 22:27:13.468582 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f8873223-0f5d-4960-a7f8-55b78b68056e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:27:13.468626 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 22:27:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:27:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 22:27:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:27:14.456906 543705 disk_worker.go:494] system disk:vda1
E0320 22:27:14.456927 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:27:14.456935 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:27:14.456941 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:27:14.456962 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:27:15.456807 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:27:15.456815 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:27:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:27:16.457964 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:27:16.458020 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:27:16.458041 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:27:16.472372 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:27:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:23.409793 543705 memory.go:184] no items to output this cycle
I0320 22:27:23.409806 543705 cpu.go:275] no items to output this cycle
I0320 22:27:26.454970 543705 disk_info.go:125] begin check local disk info of client
I0320 22:27:26.457435 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:27:26.457444 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0320 22:27:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:33.409787 543705 memory.go:184] no items to output this cycle
I0320 22:27:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 22:27:38.642916 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:27:38.642923 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:27:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:43.410696 543705 memory.go:191] Add success.
I0320 22:27:43.409798 543705 cpu.go:282] Add success.
I0320 22:27:43.420409 543705 net.go:648] Add success.
I0320 22:27:43.423303 543705 net.go:770] primary dev: ETH0
I0320 22:27:43.423317 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:27:43.423330 543705 net.go:698] Add success.
I0320 22:27:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:27:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:27:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:27:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:27:53.409764 543705 memory.go:184] no items to output this cycle
I0320 22:27:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 22:28:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:03.409811 543705 memory.go:184] no items to output this cycle
I0320 22:28:03.409819 543705 cpu.go:275] no items to output this cycle
E0320 22:28:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:13.409806 543705 memory.go:191] Add success.
I0320 22:28:13.409807 543705 cpu.go:282] Add success.
W0320 22:28:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:28:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:28:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:28:13.420154 543705 net.go:648] Add success.
I0320 22:28:13.423005 543705 net.go:770] primary dev: ETH0
I0320 22:28:13.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:28:13.423031 543705 net.go:698] Add success.
I0320 22:28:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:28:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:28:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0320 22:28:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:28:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 22:28:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:28:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:28:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:28:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:28:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:28:16.472427 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:28:23.410634 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:23.410650 543705 memory.go:184] no items to output this cycle
I0320 22:28:23.410658 543705 cpu.go:275] no items to output this cycle
I0320 22:28:26.457973 543705 disk_info.go:125] begin check local disk info of client
I0320 22:28:26.460426 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:28:26.460434 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ba640 0xc0002ba680]
E0320 22:28:33.409827 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:33.409897 543705 memory.go:184] no items to output this cycle
I0320 22:28:33.409914 543705 cpu.go:275] no items to output this cycle
E0320 22:28:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:43.409778 543705 memory.go:191] Add success.
I0320 22:28:43.409815 543705 cpu.go:282] Add success.
I0320 22:28:43.419983 543705 net.go:648] Add success.
I0320 22:28:43.422551 543705 net.go:770] primary dev: ETH0
I0320 22:28:43.422565 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:28:43.422588 543705 net.go:698] Add success.
I0320 22:28:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:28:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:28:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:28:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:28:53.409807 543705 memory.go:184] no items to output this cycle
I0320 22:28:53.409818 543705 cpu.go:275] no items to output this cycle
E0320 22:29:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:03.409810 543705 memory.go:184] no items to output this cycle
I0320 22:29:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 22:29:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:13.409821 543705 memory.go:191] Add success.
I0320 22:29:13.409831 543705 cpu.go:282] Add success.
W0320 22:29:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:29:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:29:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:29:13.420111 543705 net.go:648] Add success.
I0320 22:29:13.422821 543705 net.go:770] primary dev: ETH0
I0320 22:29:13.422835 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:29:13.422847 543705 net.go:698] Add success.
I0320 22:29:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:29:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:29:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 22:29:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:29:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 22:29:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:29:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:29:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:29:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:29:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:29:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:29:23.410433 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:23.410453 543705 memory.go:184] no items to output this cycle
I0320 22:29:23.410468 543705 cpu.go:275] no items to output this cycle
I0320 22:29:26.460641 543705 disk_info.go:125] begin check local disk info of client
I0320 22:29:26.463148 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:29:26.463155 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e12c0 0xc0003e1300]
E0320 22:29:33.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:33.409897 543705 memory.go:184] no items to output this cycle
I0320 22:29:33.409954 543705 cpu.go:275] no items to output this cycle
E0320 22:29:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:43.409802 543705 memory.go:191] Add success.
I0320 22:29:43.409819 543705 cpu.go:282] Add success.
I0320 22:29:43.419950 543705 net.go:648] Add success.
I0320 22:29:43.422776 543705 net.go:770] primary dev: ETH0
I0320 22:29:43.422790 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:29:43.422802 543705 net.go:698] Add success.
I0320 22:29:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:29:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:29:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:29:53.410477 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:29:53.410494 543705 memory.go:184] no items to output this cycle
I0320 22:29:53.410515 543705 cpu.go:275] no items to output this cycle
E0320 22:30:03.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:03.409811 543705 memory.go:184] no items to output this cycle
I0320 22:30:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 22:30:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:13.409814 543705 memory.go:191] Add success.
I0320 22:30:13.409816 543705 cpu.go:282] Add success.
W0320 22:30:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:30:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:30:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:30:13.420136 543705 net.go:648] Add success.
I0320 22:30:13.422718 543705 net.go:770] primary dev: ETH0
I0320 22:30:13.422732 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:30:13.422743 543705 net.go:698] Add success.
I0320 22:30:13.464215 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"9d122166-eb45-4ecf-91cc-6517bc5f4a39","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:30:13.464250 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:30:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:30:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:30:14.455153 543705 disk_worker.go:708] disk space is not compliant
W0320 22:30:14.455156 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:30:14.456529 543705 disk_worker.go:494] system disk:vda1
I0320 22:30:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:30:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:30:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:30:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:30:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:30:16.472419 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:30:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:23.409783 543705 memory.go:184] no items to output this cycle
I0320 22:30:23.409807 543705 cpu.go:275] no items to output this cycle
I0320 22:30:26.463603 543705 disk_info.go:125] begin check local disk info of client
I0320 22:30:26.466387 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:30:26.466395 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a000 0xc00007a700]
E0320 22:30:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:33.409776 543705 memory.go:184] no items to output this cycle
I0320 22:30:33.409785 543705 cpu.go:275] no items to output this cycle
I0320 22:30:38.643925 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:30:38.643931 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:30:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:43.410720 543705 memory.go:191] Add success.
I0320 22:30:43.409804 543705 cpu.go:282] Add success.
I0320 22:30:43.420431 543705 net.go:648] Add success.
I0320 22:30:43.423181 543705 net.go:770] primary dev: ETH0
I0320 22:30:43.423195 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:30:43.423208 543705 net.go:698] Add success.
I0320 22:30:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:30:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:30:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:30:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:30:53.409778 543705 memory.go:184] no items to output this cycle
I0320 22:30:53.409780 543705 cpu.go:275] no items to output this cycle
E0320 22:31:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:03.409809 543705 memory.go:184] no items to output this cycle
I0320 22:31:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 22:31:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:13.409784 543705 memory.go:191] Add success.
I0320 22:31:13.409789 543705 cpu.go:282] Add success.
W0320 22:31:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:31:13.412569 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:31:13.412573 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:31:13.420204 543705 net.go:648] Add success.
I0320 22:31:13.421926 543705 net.go:770] primary dev: ETH0
I0320 22:31:13.421940 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:31:13.421964 543705 net.go:698] Add success.
I0320 22:31:14.454990 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:31:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:31:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 22:31:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:31:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 22:31:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:31:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:31:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:31:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:31:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:31:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:31:23.410207 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:23.410223 543705 memory.go:184] no items to output this cycle
I0320 22:31:23.410232 543705 cpu.go:275] no items to output this cycle
I0320 22:31:26.466603 543705 disk_info.go:125] begin check local disk info of client
I0320 22:31:26.469134 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:31:26.469140 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ab740 0xc0003ab780]
E0320 22:31:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:33.409776 543705 memory.go:184] no items to output this cycle
I0320 22:31:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:31:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:43.409778 543705 memory.go:191] Add success.
I0320 22:31:43.409806 543705 cpu.go:282] Add success.
I0320 22:31:43.420054 543705 net.go:648] Add success.
I0320 22:31:43.422981 543705 net.go:770] primary dev: ETH0
I0320 22:31:43.422995 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:31:43.423008 543705 net.go:698] Add success.
I0320 22:31:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:31:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:31:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:31:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:31:53.409794 543705 memory.go:184] no items to output this cycle
I0320 22:31:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 22:32:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:03.409787 543705 memory.go:184] no items to output this cycle
I0320 22:32:03.409790 543705 cpu.go:275] no items to output this cycle
E0320 22:32:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:13.409820 543705 memory.go:191] Add success.
I0320 22:32:13.409823 543705 cpu.go:282] Add success.
W0320 22:32:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:32:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:32:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:32:13.420202 543705 net.go:648] Add success.
I0320 22:32:13.423459 543705 net.go:770] primary dev: ETH0
I0320 22:32:13.423474 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:32:13.423488 543705 net.go:698] Add success.
W0320 22:32:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:32:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0320 22:32:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:32:14.456915 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:32:14.456924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:32:14.456931 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:32:14.457004 543705 disk_worker.go:494] system disk:vda1
I0320 22:32:14.457048 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:32:15.456811 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:32:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:32:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:32:16.457920 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:32:16.457975 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:32:16.457994 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:32:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:32:23.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:23.409905 543705 memory.go:184] no items to output this cycle
I0320 22:32:23.409968 543705 cpu.go:275] no items to output this cycle
I0320 22:32:26.469592 543705 disk_info.go:125] begin check local disk info of client
I0320 22:32:26.472175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:32:26.472181 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e2c0 0xc00034e300]
E0320 22:32:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:33.409779 543705 memory.go:184] no items to output this cycle
I0320 22:32:33.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:32:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:43.409782 543705 memory.go:191] Add success.
I0320 22:32:43.409805 543705 cpu.go:282] Add success.
I0320 22:32:43.420042 543705 net.go:648] Add success.
I0320 22:32:43.422791 543705 net.go:770] primary dev: ETH0
I0320 22:32:43.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:32:43.422817 543705 net.go:698] Add success.
I0320 22:32:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:32:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:32:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:32:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:32:53.409776 543705 memory.go:184] no items to output this cycle
I0320 22:32:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 22:33:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:03.409799 543705 memory.go:184] no items to output this cycle
I0320 22:33:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:33:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:13.409777 543705 memory.go:191] Add success.
W0320 22:33:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:33:13.409806 543705 cpu.go:282] Add success.
W0320 22:33:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:33:13.409818 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:33:13.420146 543705 net.go:648] Add success.
I0320 22:33:13.422769 543705 net.go:770] primary dev: ETH0
I0320 22:33:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:33:13.422800 543705 net.go:698] Add success.
I0320 22:33:13.464345 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b099ddfb-dd1f-45c3-b290-cf55c6b22d92","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:33:13.464378 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:33:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:33:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:33:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 22:33:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:33:14.456576 543705 disk_worker.go:494] system disk:vda1
I0320 22:33:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:33:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:33:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:33:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:33:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:33:16.472089 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:33:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:23.409789 543705 memory.go:184] no items to output this cycle
I0320 22:33:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 22:33:26.472707 543705 disk_info.go:125] begin check local disk info of client
I0320 22:33:26.475294 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:33:26.475301 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa400 0xc0001fa440]
E0320 22:33:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:33.409778 543705 cpu.go:275] no items to output this cycle
I0320 22:33:33.409785 543705 memory.go:184] no items to output this cycle
I0320 22:33:38.644922 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:33:38.644929 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:33:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:43.410898 543705 memory.go:191] Add success.
I0320 22:33:43.409811 543705 cpu.go:282] Add success.
I0320 22:33:43.420597 543705 net.go:648] Add success.
I0320 22:33:43.423308 543705 net.go:770] primary dev: ETH0
I0320 22:33:43.423321 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:33:43.423334 543705 net.go:698] Add success.
I0320 22:33:46.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:33:46.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:33:46.458087 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:33:53.410389 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:33:53.410406 543705 memory.go:184] no items to output this cycle
I0320 22:33:53.410409 543705 cpu.go:275] no items to output this cycle
E0320 22:34:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:03.409781 543705 memory.go:184] no items to output this cycle
I0320 22:34:03.409802 543705 cpu.go:275] no items to output this cycle
W0320 22:34:13.409715 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:34:13.409739 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:34:13.409745 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 22:34:13.409839 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:13.409862 543705 memory.go:191] Add success.
I0320 22:34:13.409863 543705 cpu.go:282] Add success.
I0320 22:34:13.420224 543705 net.go:648] Add success.
I0320 22:34:13.423194 543705 net.go:770] primary dev: ETH0
I0320 22:34:13.423209 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:34:13.423223 543705 net.go:698] Add success.
I0320 22:34:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:34:14.455111 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:34:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 22:34:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:34:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 22:34:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:34:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:34:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:34:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:34:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:34:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:34:23.409840 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:23.409859 543705 memory.go:184] no items to output this cycle
I0320 22:34:23.409929 543705 cpu.go:275] no items to output this cycle
I0320 22:34:26.475678 543705 disk_info.go:125] begin check local disk info of client
I0320 22:34:26.478448 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:34:26.478454 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034e000 0xc00034e040]
E0320 22:34:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:33.409767 543705 memory.go:184] no items to output this cycle
I0320 22:34:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 22:34:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:43.409784 543705 memory.go:191] Add success.
I0320 22:34:43.409812 543705 cpu.go:282] Add success.
I0320 22:34:43.419928 543705 net.go:648] Add success.
I0320 22:34:43.422682 543705 net.go:770] primary dev: ETH0
I0320 22:34:43.422696 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:34:43.422710 543705 net.go:698] Add success.
I0320 22:34:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:34:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:34:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:34:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:34:53.409774 543705 memory.go:184] no items to output this cycle
I0320 22:34:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 22:35:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:03.409813 543705 memory.go:184] no items to output this cycle
I0320 22:35:03.409821 543705 cpu.go:275] no items to output this cycle
E0320 22:35:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:13.409819 543705 memory.go:191] Add success.
I0320 22:35:13.409831 543705 cpu.go:282] Add success.
W0320 22:35:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:35:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:35:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:35:13.420214 543705 net.go:648] Add success.
I0320 22:35:13.422808 543705 net.go:770] primary dev: ETH0
I0320 22:35:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:35:13.422840 543705 net.go:698] Add success.
I0320 22:35:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:35:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:35:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0320 22:35:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:35:14.456603 543705 disk_worker.go:494] system disk:vda1
I0320 22:35:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:35:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:35:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:35:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:35:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:35:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:35:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:23.409787 543705 memory.go:184] no items to output this cycle
I0320 22:35:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 22:35:26.478667 543705 disk_info.go:125] begin check local disk info of client
I0320 22:35:26.481410 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:35:26.481417 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004917c0 0xc000491800]
E0320 22:35:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:33.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:35:33.409789 543705 memory.go:184] no items to output this cycle
E0320 22:35:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:43.409809 543705 memory.go:191] Add success.
I0320 22:35:43.409819 543705 cpu.go:282] Add success.
I0320 22:35:43.420432 543705 net.go:648] Add success.
I0320 22:35:43.423530 543705 net.go:770] primary dev: ETH0
I0320 22:35:43.423554 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:35:43.423566 543705 net.go:698] Add success.
I0320 22:35:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:35:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:35:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:35:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:35:53.409777 543705 memory.go:184] no items to output this cycle
I0320 22:35:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 22:36:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:03.409787 543705 memory.go:184] no items to output this cycle
I0320 22:36:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 22:36:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:13.409800 543705 memory.go:191] Add success.
I0320 22:36:13.409802 543705 cpu.go:282] Add success.
W0320 22:36:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:36:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:36:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:36:13.420160 543705 net.go:648] Add success.
I0320 22:36:13.422771 543705 net.go:770] primary dev: ETH0
I0320 22:36:13.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:36:13.422795 543705 net.go:698] Add success.
I0320 22:36:13.468317 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"3429a258-826e-423d-b211-ff35fc6d00bb","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:36:13.468350 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:36:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:36:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:36:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 22:36:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:36:14.456621 543705 disk_worker.go:494] system disk:vda1
I0320 22:36:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:36:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:36:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:36:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:36:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:36:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:36:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:23.409792 543705 memory.go:184] no items to output this cycle
I0320 22:36:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 22:36:26.481621 543705 disk_info.go:125] begin check local disk info of client
I0320 22:36:26.484112 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:36:26.484119 543705 disk_info.go:196] parse disk info done, disk is : [0xc000470e80 0xc000470ec0]
E0320 22:36:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:33.409807 543705 memory.go:184] no items to output this cycle
I0320 22:36:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 22:36:38.645844 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:36:38.645850 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:36:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:43.410609 543705 memory.go:191] Add success.
I0320 22:36:43.409827 543705 cpu.go:282] Add success.
I0320 22:36:43.420268 543705 net.go:648] Add success.
I0320 22:36:43.422797 543705 net.go:770] primary dev: ETH0
I0320 22:36:43.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:36:43.422822 543705 net.go:698] Add success.
I0320 22:36:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:36:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:36:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:36:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:36:53.409785 543705 memory.go:184] no items to output this cycle
I0320 22:36:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 22:37:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:03.409798 543705 memory.go:184] no items to output this cycle
I0320 22:37:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 22:37:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:13.409793 543705 memory.go:191] Add success.
I0320 22:37:13.409793 543705 cpu.go:282] Add success.
W0320 22:37:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:37:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:37:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:37:13.420036 543705 net.go:648] Add success.
I0320 22:37:13.422613 543705 net.go:770] primary dev: ETH0
I0320 22:37:13.422629 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:37:13.422641 543705 net.go:698] Add success.
I0320 22:37:13.453179 543705 event_worker.go:152] Polling the log file for events...
W0320 22:37:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:37:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0320 22:37:14.455206 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:37:14.455972 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:37:14.455981 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:37:14.455988 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:37:14.456601 543705 disk_worker.go:494] system disk:vda1
I0320 22:37:14.456636 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:37:15.456851 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:37:15.456861 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:37:16.457943 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:37:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:37:16.457994 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:37:16.458012 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:37:16.472350 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:37:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:23.409811 543705 memory.go:184] no items to output this cycle
I0320 22:37:23.409822 543705 cpu.go:275] no items to output this cycle
I0320 22:37:26.484572 543705 disk_info.go:125] begin check local disk info of client
I0320 22:37:26.487082 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:37:26.487088 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e6440 0xc0003e6480]
E0320 22:37:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:33.409803 543705 memory.go:184] no items to output this cycle
I0320 22:37:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 22:37:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:43.409819 543705 memory.go:191] Add success.
I0320 22:37:43.409824 543705 cpu.go:282] Add success.
I0320 22:37:43.420689 543705 net.go:648] Add success.
I0320 22:37:43.423433 543705 net.go:770] primary dev: ETH0
I0320 22:37:43.423446 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:37:43.423457 543705 net.go:698] Add success.
I0320 22:37:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:37:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:37:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:37:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:37:53.409803 543705 memory.go:184] no items to output this cycle
I0320 22:37:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 22:38:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:03.409793 543705 memory.go:184] no items to output this cycle
I0320 22:38:03.409816 543705 cpu.go:275] no items to output this cycle
E0320 22:38:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:13.409789 543705 memory.go:191] Add success.
I0320 22:38:13.409814 543705 cpu.go:282] Add success.
W0320 22:38:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:38:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:38:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:38:13.420145 543705 net.go:648] Add success.
I0320 22:38:13.422713 543705 net.go:770] primary dev: ETH0
I0320 22:38:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:38:13.422738 543705 net.go:698] Add success.
I0320 22:38:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:38:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:38:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0320 22:38:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:38:14.456572 543705 disk_worker.go:494] system disk:vda1
I0320 22:38:14.456601 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:38:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:38:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:38:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:38:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:38:16.472422 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:38:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:23.409795 543705 memory.go:184] no items to output this cycle
I0320 22:38:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 22:38:26.487724 543705 disk_info.go:125] begin check local disk info of client
I0320 22:38:26.490330 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:38:26.490336 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003af280 0xc0003af2c0]
E0320 22:38:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:33.409799 543705 memory.go:184] no items to output this cycle
I0320 22:38:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 22:38:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:43.409885 543705 memory.go:191] Add success.
I0320 22:38:43.409940 543705 cpu.go:282] Add success.
I0320 22:38:43.419711 543705 net.go:648] Add success.
I0320 22:38:43.422275 543705 net.go:770] primary dev: ETH0
I0320 22:38:43.422288 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:38:43.422301 543705 net.go:698] Add success.
I0320 22:38:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:38:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:38:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:38:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:38:53.409782 543705 memory.go:184] no items to output this cycle
I0320 22:38:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 22:39:03.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:03.409815 543705 memory.go:184] no items to output this cycle
I0320 22:39:03.409827 543705 cpu.go:275] no items to output this cycle
E0320 22:39:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:13.409786 543705 memory.go:191] Add success.
I0320 22:39:13.409811 543705 cpu.go:282] Add success.
W0320 22:39:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:39:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:39:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:39:13.420172 543705 net.go:648] Add success.
I0320 22:39:13.422766 543705 net.go:770] primary dev: ETH0
I0320 22:39:13.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:39:13.422793 543705 net.go:698] Add success.
I0320 22:39:13.498075 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0b90e8d4-6c95-4362-9ae2-e348afc3dc3a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:39:13.498109 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:39:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:39:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:39:14.455250 543705 disk_worker.go:708] disk space is not compliant
W0320 22:39:14.455254 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:39:14.456776 543705 disk_worker.go:494] system disk:vda1
I0320 22:39:14.456812 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:39:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:39:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:39:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:39:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:39:16.472406 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:39:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:23.409798 543705 memory.go:184] no items to output this cycle
I0320 22:39:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 22:39:26.490682 543705 disk_info.go:125] begin check local disk info of client
I0320 22:39:26.493213 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:39:26.493220 543705 disk_info.go:196] parse disk info done, disk is : [0xc000381b00 0xc000381b40]
E0320 22:39:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:33.409780 543705 memory.go:184] no items to output this cycle
I0320 22:39:33.409797 543705 cpu.go:275] no items to output this cycle
I0320 22:39:38.646939 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:39:38.646945 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:39:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:43.409781 543705 memory.go:191] Add success.
I0320 22:39:43.409792 543705 cpu.go:282] Add success.
I0320 22:39:43.419739 543705 net.go:648] Add success.
I0320 22:39:43.420663 543705 net.go:770] primary dev: ETH0
I0320 22:39:43.420678 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:39:43.420690 543705 net.go:698] Add success.
I0320 22:39:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:39:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:39:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:39:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:39:53.409782 543705 memory.go:184] no items to output this cycle
I0320 22:39:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 22:40:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:03.409789 543705 cpu.go:275] no items to output this cycle
I0320 22:40:03.409801 543705 memory.go:184] no items to output this cycle
E0320 22:40:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:13.409822 543705 memory.go:191] Add success.
I0320 22:40:13.409829 543705 cpu.go:282] Add success.
W0320 22:40:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:40:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:40:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:40:13.420289 543705 net.go:648] Add success.
I0320 22:40:13.423110 543705 net.go:770] primary dev: ETH0
I0320 22:40:13.423136 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:40:13.423149 543705 net.go:698] Add success.
I0320 22:40:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:40:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:40:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 22:40:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:40:14.456609 543705 disk_worker.go:494] system disk:vda1
I0320 22:40:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:40:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:40:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:40:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:40:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:40:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:40:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:40:23.409786 543705 memory.go:184] no items to output this cycle
I0320 22:40:26.493747 543705 disk_info.go:125] begin check local disk info of client
I0320 22:40:26.496269 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:40:26.496275 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003f0e00 0xc0003f0e40]
E0320 22:40:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:33.409767 543705 memory.go:184] no items to output this cycle
I0320 22:40:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 22:40:43.409882 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:43.409912 543705 memory.go:191] Add success.
I0320 22:40:43.409958 543705 cpu.go:282] Add success.
I0320 22:40:43.419744 543705 net.go:648] Add success.
I0320 22:40:43.422348 543705 net.go:770] primary dev: ETH0
I0320 22:40:43.422361 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:40:43.422372 543705 net.go:698] Add success.
I0320 22:40:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:40:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:40:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:40:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:40:53.409768 543705 memory.go:184] no items to output this cycle
I0320 22:40:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 22:41:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:03.409790 543705 memory.go:184] no items to output this cycle
I0320 22:41:03.409829 543705 cpu.go:275] no items to output this cycle
E0320 22:41:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:13.409832 543705 memory.go:191] Add success.
I0320 22:41:13.409835 543705 cpu.go:282] Add success.
W0320 22:41:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:41:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:41:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:41:13.420244 543705 net.go:648] Add success.
I0320 22:41:13.422727 543705 net.go:770] primary dev: ETH0
I0320 22:41:13.422741 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:41:13.422752 543705 net.go:698] Add success.
I0320 22:41:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:41:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:41:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 22:41:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:41:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 22:41:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:41:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:41:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:41:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:41:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:41:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:41:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:23.409800 543705 memory.go:184] no items to output this cycle
I0320 22:41:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 22:41:26.496796 543705 disk_info.go:125] begin check local disk info of client
I0320 22:41:26.499251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:41:26.499258 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039f580 0xc00039f5c0]
E0320 22:41:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:33.409777 543705 memory.go:184] no items to output this cycle
I0320 22:41:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 22:41:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:43.409905 543705 cpu.go:282] Add success.
I0320 22:41:43.409931 543705 memory.go:191] Add success.
I0320 22:41:43.419709 543705 net.go:648] Add success.
I0320 22:41:43.422362 543705 net.go:770] primary dev: ETH0
I0320 22:41:43.422375 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:41:43.422387 543705 net.go:698] Add success.
I0320 22:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:41:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:41:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:41:53.409763 543705 memory.go:184] no items to output this cycle
I0320 22:41:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:42:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:03.409785 543705 memory.go:184] no items to output this cycle
I0320 22:42:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 22:42:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:13.409798 543705 memory.go:191] Add success.
I0320 22:42:13.409798 543705 cpu.go:282] Add success.
W0320 22:42:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:42:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:42:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:42:13.420110 543705 net.go:648] Add success.
I0320 22:42:13.422983 543705 net.go:770] primary dev: ETH0
I0320 22:42:13.422997 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:42:13.423010 543705 net.go:698] Add success.
I0320 22:42:13.465298 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"dab6900a-5225-496d-b9a3-e59d46a5be16","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:42:13.465337 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 22:42:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:42:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0320 22:42:14.455165 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:42:14.456967 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:42:14.456976 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:42:14.456982 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:42:14.457027 543705 disk_worker.go:494] system disk:vda1
I0320 22:42:14.457059 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:42:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:42:15.456851 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 22:42:16.457958 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:42:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:42:16.458016 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:42:16.458036 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:42:16.472363 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:42:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:23.409793 543705 memory.go:184] no items to output this cycle
I0320 22:42:23.409804 543705 cpu.go:275] no items to output this cycle
I0320 22:42:26.499786 543705 disk_info.go:125] begin check local disk info of client
I0320 22:42:26.502257 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:42:26.502263 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048c600 0xc00048c640]
E0320 22:42:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:33.409770 543705 memory.go:184] no items to output this cycle
I0320 22:42:33.409790 543705 cpu.go:275] no items to output this cycle
I0320 22:42:38.647942 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:42:38.647948 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:42:43.409889 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:43.410782 543705 memory.go:191] Add success.
I0320 22:42:43.409984 543705 cpu.go:282] Add success.
I0320 22:42:43.419745 543705 net.go:648] Add success.
I0320 22:42:43.422327 543705 net.go:770] primary dev: ETH0
I0320 22:42:43.422341 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:42:43.422354 543705 net.go:698] Add success.
I0320 22:42:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:42:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:42:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:42:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:42:53.409798 543705 memory.go:184] no items to output this cycle
I0320 22:42:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 22:43:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:03.409789 543705 memory.go:184] no items to output this cycle
I0320 22:43:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 22:43:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:13.409778 543705 memory.go:191] Add success.
W0320 22:43:13.409804 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:43:13.409809 543705 cpu.go:282] Add success.
W0320 22:43:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:43:13.409819 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:43:13.420124 543705 net.go:648] Add success.
I0320 22:43:13.422584 543705 net.go:770] primary dev: ETH0
I0320 22:43:13.422597 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:43:13.422610 543705 net.go:698] Add success.
I0320 22:43:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:43:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:43:14.455218 543705 disk_worker.go:708] disk space is not compliant
W0320 22:43:14.455221 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:43:14.456623 543705 disk_worker.go:494] system disk:vda1
I0320 22:43:14.456654 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:43:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:43:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:43:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:43:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:43:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:43:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 22:43:23.409784 543705 memory.go:184] no items to output this cycle
I0320 22:43:26.502723 543705 disk_info.go:125] begin check local disk info of client
I0320 22:43:26.505189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:43:26.505195 543705 disk_info.go:196] parse disk info done, disk is : [0xc000460d00 0xc000460d40]
E0320 22:43:33.409742 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:33.409757 543705 memory.go:184] no items to output this cycle
I0320 22:43:33.409797 543705 cpu.go:275] no items to output this cycle
E0320 22:43:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:43.409887 543705 cpu.go:282] Add success.
I0320 22:43:43.409911 543705 memory.go:191] Add success.
I0320 22:43:43.419719 543705 net.go:648] Add success.
I0320 22:43:43.422319 543705 net.go:770] primary dev: ETH0
I0320 22:43:43.422334 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:43:43.422348 543705 net.go:698] Add success.
I0320 22:43:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:43:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:43:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:43:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:43:53.409797 543705 memory.go:184] no items to output this cycle
I0320 22:43:53.409801 543705 cpu.go:275] no items to output this cycle
E0320 22:44:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:03.409795 543705 memory.go:184] no items to output this cycle
I0320 22:44:03.409799 543705 cpu.go:275] no items to output this cycle
E0320 22:44:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:13.409831 543705 memory.go:191] Add success.
I0320 22:44:13.409833 543705 cpu.go:282] Add success.
W0320 22:44:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:44:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:44:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:44:13.420368 543705 net.go:648] Add success.
I0320 22:44:13.423447 543705 net.go:770] primary dev: ETH0
I0320 22:44:13.423462 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:44:13.423478 543705 net.go:698] Add success.
I0320 22:44:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:44:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:44:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0320 22:44:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:44:14.456491 543705 disk_worker.go:494] system disk:vda1
I0320 22:44:14.456535 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:44:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:44:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:44:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:44:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:44:16.472455 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:44:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:23.409801 543705 memory.go:184] no items to output this cycle
I0320 22:44:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 22:44:26.505826 543705 disk_info.go:125] begin check local disk info of client
I0320 22:44:26.508551 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:44:26.508557 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046ce00 0xc00046ce40]
E0320 22:44:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:33.409789 543705 memory.go:184] no items to output this cycle
I0320 22:44:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 22:44:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:43.409810 543705 memory.go:191] Add success.
I0320 22:44:43.409811 543705 cpu.go:282] Add success.
I0320 22:44:43.419961 543705 net.go:648] Add success.
I0320 22:44:43.422572 543705 net.go:770] primary dev: ETH0
I0320 22:44:43.422586 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:44:43.422597 543705 net.go:698] Add success.
I0320 22:44:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:44:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:44:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:44:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:44:53.409788 543705 memory.go:184] no items to output this cycle
I0320 22:44:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 22:45:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:03.409794 543705 memory.go:184] no items to output this cycle
I0320 22:45:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 22:45:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:13.409821 543705 memory.go:191] Add success.
I0320 22:45:13.409829 543705 cpu.go:282] Add success.
W0320 22:45:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:45:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:45:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:45:13.420207 543705 net.go:648] Add success.
I0320 22:45:13.423031 543705 net.go:770] primary dev: ETH0
I0320 22:45:13.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:45:13.423061 543705 net.go:698] Add success.
I0320 22:45:13.469245 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7e63d0f8-3f93-42b9-9ef7-8e4be49074e6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:45:13.469277 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:45:14.454061 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:45:14.454255 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:45:14.454265 543705 disk_worker.go:708] disk space is not compliant
W0320 22:45:14.454268 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:45:14.455600 543705 disk_worker.go:494] system disk:vda1
I0320 22:45:14.455647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:45:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:45:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:45:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:45:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:45:16.472480 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:45:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:23.409780 543705 memory.go:184] no items to output this cycle
I0320 22:45:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 22:45:26.508757 543705 disk_info.go:125] begin check local disk info of client
I0320 22:45:26.511494 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:45:26.511501 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340600 0xc000340640]
E0320 22:45:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:33.409807 543705 memory.go:184] no items to output this cycle
I0320 22:45:33.409825 543705 cpu.go:275] no items to output this cycle
I0320 22:45:38.648938 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:45:38.648950 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:45:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:43.410644 543705 memory.go:191] Add success.
I0320 22:45:43.409815 543705 cpu.go:282] Add success.
I0320 22:45:43.420329 543705 net.go:648] Add success.
I0320 22:45:43.422836 543705 net.go:770] primary dev: ETH0
I0320 22:45:43.422861 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:45:43.422874 543705 net.go:698] Add success.
I0320 22:45:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:45:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:45:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:45:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:45:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:45:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 22:46:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:03.409779 543705 memory.go:184] no items to output this cycle
I0320 22:46:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 22:46:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:13.409826 543705 memory.go:191] Add success.
I0320 22:46:13.409832 543705 cpu.go:282] Add success.
W0320 22:46:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:46:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:46:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:46:13.420202 543705 net.go:648] Add success.
I0320 22:46:13.422853 543705 net.go:770] primary dev: ETH0
I0320 22:46:13.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:46:13.422878 543705 net.go:698] Add success.
I0320 22:46:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:46:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:46:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0320 22:46:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:46:14.456613 543705 disk_worker.go:494] system disk:vda1
I0320 22:46:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:46:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:46:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:46:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:46:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:46:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:46:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:23.409773 543705 memory.go:184] no items to output this cycle
I0320 22:46:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 22:46:26.511761 543705 disk_info.go:125] begin check local disk info of client
I0320 22:46:26.514256 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:46:26.514262 543705 disk_info.go:196] parse disk info done, disk is : [0xc000264e80 0xc000264ec0]
E0320 22:46:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:33.409796 543705 memory.go:184] no items to output this cycle
I0320 22:46:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 22:46:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:43.409775 543705 memory.go:191] Add success.
I0320 22:46:43.409822 543705 cpu.go:282] Add success.
I0320 22:46:43.419896 543705 net.go:648] Add success.
I0320 22:46:43.422956 543705 net.go:770] primary dev: ETH0
I0320 22:46:43.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:46:43.423001 543705 net.go:698] Add success.
I0320 22:46:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:46:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:46:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:46:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:46:53.409798 543705 memory.go:184] no items to output this cycle
I0320 22:46:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 22:47:03.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:03.409819 543705 memory.go:184] no items to output this cycle
I0320 22:47:03.409831 543705 cpu.go:275] no items to output this cycle
E0320 22:47:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:13.409795 543705 memory.go:191] Add success.
I0320 22:47:13.409814 543705 cpu.go:282] Add success.
W0320 22:47:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:47:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:47:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:47:13.420176 543705 net.go:648] Add success.
I0320 22:47:13.422813 543705 net.go:770] primary dev: ETH0
I0320 22:47:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:47:13.422838 543705 net.go:698] Add success.
I0320 22:47:13.453369 543705 event_worker.go:152] Polling the log file for events...
W0320 22:47:14.454299 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:47:14.454395 543705 disk_worker.go:708] disk space is not compliant
W0320 22:47:14.454400 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:47:14.454901 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:47:14.454910 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:47:14.454917 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:47:14.455970 543705 disk_worker.go:494] system disk:vda1
I0320 22:47:14.456028 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:47:15.456841 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:47:15.456849 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:47:16.457933 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:47:16.457933 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:47:16.457996 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:47:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:47:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:47:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:23.409778 543705 memory.go:184] no items to output this cycle
I0320 22:47:23.409780 543705 cpu.go:275] no items to output this cycle
I0320 22:47:26.514726 543705 disk_info.go:125] begin check local disk info of client
I0320 22:47:26.517179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:47:26.517185 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae540 0xc0004ae580]
E0320 22:47:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:33.409805 543705 memory.go:184] no items to output this cycle
I0320 22:47:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 22:47:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:43.409795 543705 memory.go:191] Add success.
I0320 22:47:43.409797 543705 cpu.go:282] Add success.
I0320 22:47:43.419848 543705 net.go:648] Add success.
I0320 22:47:43.422225 543705 net.go:770] primary dev: ETH0
I0320 22:47:43.422240 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:47:43.422255 543705 net.go:698] Add success.
I0320 22:47:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:47:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:47:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:47:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:47:53.409775 543705 memory.go:184] no items to output this cycle
I0320 22:47:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 22:48:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:03.409788 543705 memory.go:184] no items to output this cycle
I0320 22:48:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 22:48:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:13.409821 543705 memory.go:191] Add success.
I0320 22:48:13.409836 543705 cpu.go:282] Add success.
W0320 22:48:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:48:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:48:13.409861 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:48:13.420470 543705 net.go:648] Add success.
I0320 22:48:13.423636 543705 net.go:770] primary dev: ETH0
I0320 22:48:13.423649 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:48:13.423661 543705 net.go:698] Add success.
I0320 22:48:13.469762 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bab4a59f-de75-47ef-bc6b-4e7a08ef4c1b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:48:13.469795 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:48:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:48:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:48:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0320 22:48:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:48:14.456533 543705 disk_worker.go:494] system disk:vda1
I0320 22:48:14.456585 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:48:15.455612 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:48:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:48:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:48:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:48:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:48:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:23.409797 543705 memory.go:184] no items to output this cycle
I0320 22:48:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 22:48:26.517778 543705 disk_info.go:125] begin check local disk info of client
I0320 22:48:26.520225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:48:26.520231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbd00 0xc0001fbd40]
E0320 22:48:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:33.409765 543705 memory.go:184] no items to output this cycle
I0320 22:48:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 22:48:38.649735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:48:38.649742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:48:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:43.410656 543705 memory.go:191] Add success.
I0320 22:48:43.409797 543705 cpu.go:282] Add success.
I0320 22:48:43.420458 543705 net.go:648] Add success.
I0320 22:48:43.423130 543705 net.go:770] primary dev: ETH0
I0320 22:48:43.423143 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:48:43.423156 543705 net.go:698] Add success.
I0320 22:48:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:48:46.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:48:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:48:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:48:53.409793 543705 memory.go:184] no items to output this cycle
I0320 22:48:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 22:49:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:03.409784 543705 memory.go:184] no items to output this cycle
I0320 22:49:03.409813 543705 cpu.go:275] no items to output this cycle
E0320 22:49:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:13.409797 543705 memory.go:191] Add success.
I0320 22:49:13.409800 543705 cpu.go:282] Add success.
W0320 22:49:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:49:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:49:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:49:13.420155 543705 net.go:648] Add success.
I0320 22:49:13.422708 543705 net.go:770] primary dev: ETH0
I0320 22:49:13.422722 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:49:13.422736 543705 net.go:698] Add success.
W0320 22:49:14.455257 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:49:14.455277 543705 disk_worker.go:708] disk space is not compliant
W0320 22:49:14.455281 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:49:14.455633 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:49:14.457451 543705 disk_worker.go:494] system disk:vda1
I0320 22:49:14.457497 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:49:15.455976 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:49:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:49:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:49:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:49:16.472491 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:49:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:23.409784 543705 memory.go:184] no items to output this cycle
I0320 22:49:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 22:49:26.520904 543705 disk_info.go:125] begin check local disk info of client
I0320 22:49:26.523427 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:49:26.523434 543705 disk_info.go:196] parse disk info done, disk is : [0xc000590580 0xc0005905c0]
E0320 22:49:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:33.409807 543705 memory.go:184] no items to output this cycle
I0320 22:49:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 22:49:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:43.409788 543705 memory.go:191] Add success.
I0320 22:49:43.409804 543705 cpu.go:282] Add success.
I0320 22:49:43.419857 543705 net.go:648] Add success.
I0320 22:49:43.422484 543705 net.go:770] primary dev: ETH0
I0320 22:49:43.422497 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:49:43.422509 543705 net.go:698] Add success.
I0320 22:49:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:49:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:49:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:49:53.410272 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:49:53.410287 543705 memory.go:184] no items to output this cycle
I0320 22:49:53.410292 543705 cpu.go:275] no items to output this cycle
E0320 22:50:03.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:03.409808 543705 memory.go:184] no items to output this cycle
I0320 22:50:03.409825 543705 cpu.go:275] no items to output this cycle
E0320 22:50:13.409940 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:13.409969 543705 memory.go:191] Add success.
W0320 22:50:13.410006 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:50:13.410008 543705 cpu.go:282] Add success.
W0320 22:50:13.410020 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:50:13.410023 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:50:13.419712 543705 net.go:648] Add success.
I0320 22:50:13.422228 543705 net.go:770] primary dev: ETH0
I0320 22:50:13.422251 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:50:13.422263 543705 net.go:698] Add success.
I0320 22:50:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:50:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:50:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 22:50:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:50:14.456594 543705 disk_worker.go:494] system disk:vda1
I0320 22:50:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:50:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:50:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:50:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:50:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:50:16.472433 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:50:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:23.409775 543705 memory.go:184] no items to output this cycle
I0320 22:50:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:50:26.523515 543705 disk_info.go:125] begin check local disk info of client
I0320 22:50:26.525975 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:50:26.525981 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001facc0 0xc0001fad00]
E0320 22:50:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:33.409777 543705 memory.go:184] no items to output this cycle
I0320 22:50:33.409782 543705 cpu.go:275] no items to output this cycle
E0320 22:50:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:43.409813 543705 memory.go:191] Add success.
I0320 22:50:43.409822 543705 cpu.go:282] Add success.
I0320 22:50:43.420037 543705 net.go:648] Add success.
I0320 22:50:43.423144 543705 net.go:770] primary dev: ETH0
I0320 22:50:43.423158 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:50:43.423170 543705 net.go:698] Add success.
I0320 22:50:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:50:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:50:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:50:53.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:50:53.409800 543705 memory.go:184] no items to output this cycle
I0320 22:50:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 22:51:03.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:03.409813 543705 memory.go:184] no items to output this cycle
I0320 22:51:03.409826 543705 cpu.go:275] no items to output this cycle
E0320 22:51:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:13.409805 543705 memory.go:191] Add success.
I0320 22:51:13.409807 543705 cpu.go:282] Add success.
W0320 22:51:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:51:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:51:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:51:13.420280 543705 net.go:648] Add success.
I0320 22:51:13.422712 543705 net.go:770] primary dev: ETH0
I0320 22:51:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:51:13.422737 543705 net.go:698] Add success.
I0320 22:51:13.548896 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c3c27e22-1a37-45ad-8227-c9253af49e99","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:51:13.548928 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:51:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:51:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:51:14.455146 543705 disk_worker.go:708] disk space is not compliant
W0320 22:51:14.455149 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:51:14.456506 543705 disk_worker.go:494] system disk:vda1
I0320 22:51:14.456547 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:51:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:51:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:51:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:51:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:51:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:51:23.410242 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:23.410260 543705 memory.go:184] no items to output this cycle
I0320 22:51:23.410272 543705 cpu.go:275] no items to output this cycle
I0320 22:51:26.526728 543705 disk_info.go:125] begin check local disk info of client
I0320 22:51:26.529204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:51:26.529209 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa180 0xc0001fa1c0]
E0320 22:51:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:33.409800 543705 memory.go:184] no items to output this cycle
I0320 22:51:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 22:51:38.650963 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:51:38.650970 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:51:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:43.410730 543705 memory.go:191] Add success.
I0320 22:51:43.409830 543705 cpu.go:282] Add success.
I0320 22:51:43.420288 543705 net.go:770] primary dev: ETH0
I0320 22:51:43.420301 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:51:43.420313 543705 net.go:698] Add success.
I0320 22:51:43.420642 543705 net.go:648] Add success.
I0320 22:51:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:51:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:51:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:51:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:51:53.409777 543705 memory.go:184] no items to output this cycle
I0320 22:51:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 22:52:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:03.409784 543705 memory.go:184] no items to output this cycle
I0320 22:52:03.409860 543705 cpu.go:275] no items to output this cycle
W0320 22:52:13.409715 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:52:13.409737 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:52:13.409743 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:52:13.409833 543705 cpu.go:282] Add success.
E0320 22:52:13.409836 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:13.409856 543705 memory.go:191] Add success.
I0320 22:52:13.420370 543705 net.go:648] Add success.
I0320 22:52:13.423569 543705 net.go:770] primary dev: ETH0
I0320 22:52:13.423582 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:52:13.423594 543705 net.go:698] Add success.
W0320 22:52:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:52:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 22:52:14.455172 543705 disk_worker.go:728] disk inode is not compliant
E0320 22:52:14.455891 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:52:14.455900 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:52:14.455905 543705 custom_config.go:64] query custom config with name: gpu
I0320 22:52:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 22:52:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:52:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:52:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:52:16.457947 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 22:52:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:52:16.458000 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:52:16.458020 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:52:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:52:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:23.409783 543705 memory.go:184] no items to output this cycle
I0320 22:52:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 22:52:26.529934 543705 disk_info.go:125] begin check local disk info of client
I0320 22:52:26.532452 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:52:26.532458 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5940 0xc0000c5980]
E0320 22:52:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:33.409775 543705 memory.go:184] no items to output this cycle
I0320 22:52:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 22:52:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:43.409787 543705 memory.go:191] Add success.
I0320 22:52:43.409813 543705 cpu.go:282] Add success.
I0320 22:52:43.420136 543705 net.go:648] Add success.
I0320 22:52:43.422660 543705 net.go:770] primary dev: ETH0
I0320 22:52:43.422674 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:52:43.422686 543705 net.go:698] Add success.
I0320 22:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:52:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:52:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:52:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:52:53.409766 543705 memory.go:184] no items to output this cycle
I0320 22:52:53.409798 543705 cpu.go:275] no items to output this cycle
E0320 22:53:03.409816 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:03.409829 543705 cpu.go:275] no items to output this cycle
I0320 22:53:03.409837 543705 memory.go:184] no items to output this cycle
E0320 22:53:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:13.409829 543705 memory.go:191] Add success.
I0320 22:53:13.409836 543705 cpu.go:282] Add success.
W0320 22:53:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:53:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:53:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:53:13.420133 543705 net.go:648] Add success.
I0320 22:53:13.422907 543705 net.go:770] primary dev: ETH0
I0320 22:53:13.422920 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:53:13.422931 543705 net.go:698] Add success.
I0320 22:53:14.454951 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:53:14.455099 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:53:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 22:53:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:53:14.456559 543705 disk_worker.go:494] system disk:vda1
I0320 22:53:14.456588 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:53:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:53:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:53:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:53:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:53:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:53:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:23.409781 543705 memory.go:184] no items to output this cycle
I0320 22:53:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 22:53:26.532985 543705 disk_info.go:125] begin check local disk info of client
I0320 22:53:26.535509 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:53:26.535516 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049f980 0xc00049f9c0]
E0320 22:53:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:33.409789 543705 memory.go:184] no items to output this cycle
I0320 22:53:33.409793 543705 cpu.go:275] no items to output this cycle
E0320 22:53:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:43.409801 543705 memory.go:191] Add success.
I0320 22:53:43.409808 543705 cpu.go:282] Add success.
I0320 22:53:43.419890 543705 net.go:648] Add success.
I0320 22:53:43.422477 543705 net.go:770] primary dev: ETH0
I0320 22:53:43.422491 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:53:43.422503 543705 net.go:698] Add success.
I0320 22:53:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:53:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:53:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:53:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:53:53.409817 543705 memory.go:184] no items to output this cycle
I0320 22:53:53.409827 543705 cpu.go:275] no items to output this cycle
E0320 22:54:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:03.409793 543705 memory.go:184] no items to output this cycle
I0320 22:54:03.409862 543705 cpu.go:275] no items to output this cycle
E0320 22:54:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:13.409795 543705 memory.go:191] Add success.
W0320 22:54:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:54:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:54:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:54:13.409845 543705 cpu.go:282] Add success.
I0320 22:54:13.420377 543705 net.go:648] Add success.
I0320 22:54:13.423199 543705 net.go:770] primary dev: ETH0
I0320 22:54:13.423213 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:54:13.423225 543705 net.go:698] Add success.
I0320 22:54:13.463763 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cd9b646a-7f46-408c-984d-de7ddacd1e12","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:54:13.463794 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 22:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:54:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:54:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 22:54:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:54:14.456498 543705 disk_worker.go:494] system disk:vda1
I0320 22:54:14.456544 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:54:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:54:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:54:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:54:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:54:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:54:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:23.409791 543705 memory.go:184] no items to output this cycle
I0320 22:54:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 22:54:26.535930 543705 disk_info.go:125] begin check local disk info of client
I0320 22:54:26.538426 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:54:26.538432 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c54c0 0xc0000c5500]
E0320 22:54:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:33.409804 543705 memory.go:184] no items to output this cycle
I0320 22:54:33.409818 543705 cpu.go:275] no items to output this cycle
I0320 22:54:38.651960 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:54:38.651966 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:54:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:43.410769 543705 memory.go:191] Add success.
I0320 22:54:43.409794 543705 cpu.go:282] Add success.
I0320 22:54:43.420486 543705 net.go:648] Add success.
I0320 22:54:43.423283 543705 net.go:770] primary dev: ETH0
I0320 22:54:43.423302 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:54:43.423315 543705 net.go:698] Add success.
I0320 22:54:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:54:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:54:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:54:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:54:53.409781 543705 memory.go:184] no items to output this cycle
I0320 22:54:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 22:55:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:03.409781 543705 memory.go:184] no items to output this cycle
I0320 22:55:03.409881 543705 cpu.go:275] no items to output this cycle
E0320 22:55:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:13.409799 543705 memory.go:191] Add success.
I0320 22:55:13.409818 543705 cpu.go:282] Add success.
W0320 22:55:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:55:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:55:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:55:13.420338 543705 net.go:648] Add success.
I0320 22:55:13.422949 543705 net.go:770] primary dev: ETH0
I0320 22:55:13.422962 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:55:13.422973 543705 net.go:698] Add success.
I0320 22:55:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:55:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:55:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0320 22:55:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:55:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 22:55:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:55:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:55:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:55:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:55:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:55:16.472473 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:55:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:23.409786 543705 memory.go:184] no items to output this cycle
I0320 22:55:23.409789 543705 cpu.go:275] no items to output this cycle
I0320 22:55:26.538735 543705 disk_info.go:125] begin check local disk info of client
I0320 22:55:26.541202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:55:26.541209 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032eac0 0xc00032eb00]
E0320 22:55:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:33.409805 543705 memory.go:184] no items to output this cycle
I0320 22:55:33.409816 543705 cpu.go:275] no items to output this cycle
E0320 22:55:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:43.409785 543705 memory.go:191] Add success.
I0320 22:55:43.409806 543705 cpu.go:282] Add success.
I0320 22:55:43.420054 543705 net.go:648] Add success.
I0320 22:55:43.422866 543705 net.go:770] primary dev: ETH0
I0320 22:55:43.422880 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:55:43.422895 543705 net.go:698] Add success.
I0320 22:55:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:55:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:55:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:55:53.410254 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:55:53.410263 543705 cpu.go:275] no items to output this cycle
I0320 22:55:53.410269 543705 memory.go:184] no items to output this cycle
E0320 22:56:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:03.409774 543705 memory.go:184] no items to output this cycle
I0320 22:56:03.409839 543705 cpu.go:275] no items to output this cycle
E0320 22:56:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:13.409824 543705 memory.go:191] Add success.
I0320 22:56:13.409825 543705 cpu.go:282] Add success.
W0320 22:56:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:56:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:56:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:56:13.420288 543705 net.go:648] Add success.
I0320 22:56:13.423147 543705 net.go:770] primary dev: ETH0
I0320 22:56:13.423160 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:56:13.423172 543705 net.go:698] Add success.
I0320 22:56:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:56:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:56:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 22:56:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:56:14.456624 543705 disk_worker.go:494] system disk:vda1
I0320 22:56:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:56:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:56:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:56:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:56:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:56:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:56:23.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:23.409766 543705 memory.go:184] no items to output this cycle
I0320 22:56:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 22:56:26.541969 543705 disk_info.go:125] begin check local disk info of client
I0320 22:56:26.544484 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:56:26.544491 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0320 22:56:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:33.409799 543705 memory.go:184] no items to output this cycle
I0320 22:56:33.409811 543705 cpu.go:275] no items to output this cycle
E0320 22:56:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:43.409801 543705 memory.go:191] Add success.
I0320 22:56:43.409809 543705 cpu.go:282] Add success.
I0320 22:56:43.419951 543705 net.go:648] Add success.
I0320 22:56:43.422489 543705 net.go:770] primary dev: ETH0
I0320 22:56:43.422504 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:56:43.422518 543705 net.go:698] Add success.
I0320 22:56:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:56:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:56:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:56:53.410476 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:56:53.410490 543705 memory.go:184] no items to output this cycle
I0320 22:56:53.410493 543705 cpu.go:275] no items to output this cycle
E0320 22:57:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:03.409796 543705 memory.go:184] no items to output this cycle
I0320 22:57:03.409869 543705 cpu.go:275] no items to output this cycle
E0320 22:57:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:13.409791 543705 memory.go:191] Add success.
I0320 22:57:13.409793 543705 cpu.go:282] Add success.
W0320 22:57:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:57:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:57:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:57:13.420167 543705 net.go:648] Add success.
I0320 22:57:13.429157 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 22:57:13.429232 543705 net.go:770] primary dev: ETH0
I0320 22:57:13.429243 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:57:13.429255 543705 net.go:698] Add success.
I0320 22:57:13.452772 543705 event_worker.go:152] Polling the log file for events...
I0320 22:57:13.463316 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"6de66bad-cdb3-4b23-9b9c-06ef7d3536c8","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 22:57:13.463352 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 22:57:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:57:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 22:57:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:57:14.456819 543705 disk_worker.go:494] system disk:vda1
I0320 22:57:14.456856 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 22:57:14.457053 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 22:57:14.457061 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 22:57:14.457065 543705 custom_config.go:64] query custom config with name: gpu
E0320 22:57:15.456926 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 22:57:15.456938 543705 custom_config.go:64] query custom config with name: huawei_npu
E0320 22:57:16.457977 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 22:57:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:57:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:57:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:57:16.472459 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:57:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:23.409801 543705 memory.go:184] no items to output this cycle
I0320 22:57:23.409814 543705 cpu.go:275] no items to output this cycle
I0320 22:57:26.544930 543705 disk_info.go:125] begin check local disk info of client
I0320 22:57:26.547447 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:57:26.547454 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a8700 0xc0002a8740]
E0320 22:57:33.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:33.409768 543705 memory.go:184] no items to output this cycle
I0320 22:57:33.409791 543705 cpu.go:275] no items to output this cycle
I0320 22:57:38.652106 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 22:57:38.652113 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 22:57:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:43.410694 543705 memory.go:191] Add success.
I0320 22:57:43.409810 543705 cpu.go:282] Add success.
I0320 22:57:43.419712 543705 net.go:648] Add success.
I0320 22:57:43.422346 543705 net.go:770] primary dev: ETH0
I0320 22:57:43.422358 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:57:43.422371 543705 net.go:698] Add success.
I0320 22:57:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:57:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:57:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:57:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:57:53.409793 543705 memory.go:184] no items to output this cycle
I0320 22:57:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 22:58:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:03.409796 543705 memory.go:184] no items to output this cycle
I0320 22:58:03.409815 543705 cpu.go:275] no items to output this cycle
E0320 22:58:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:13.409820 543705 memory.go:191] Add success.
I0320 22:58:13.409835 543705 cpu.go:282] Add success.
W0320 22:58:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 22:58:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:58:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:58:13.420180 543705 net.go:648] Add success.
I0320 22:58:13.423105 543705 net.go:770] primary dev: ETH0
I0320 22:58:13.423121 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:58:13.423133 543705 net.go:698] Add success.
I0320 22:58:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:58:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:58:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 22:58:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:58:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 22:58:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:58:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:58:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:58:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:58:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:58:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:58:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:23.409811 543705 memory.go:184] no items to output this cycle
I0320 22:58:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 22:58:26.548055 543705 disk_info.go:125] begin check local disk info of client
I0320 22:58:26.550564 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:58:26.550570 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004632c0 0xc000463300]
E0320 22:58:33.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:33.409901 543705 memory.go:184] no items to output this cycle
I0320 22:58:33.409940 543705 cpu.go:275] no items to output this cycle
E0320 22:58:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:43.409794 543705 memory.go:191] Add success.
I0320 22:58:43.409799 543705 cpu.go:282] Add success.
I0320 22:58:43.419993 543705 net.go:648] Add success.
I0320 22:58:43.422890 543705 net.go:770] primary dev: ETH0
I0320 22:58:43.422908 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:58:43.422923 543705 net.go:698] Add success.
I0320 22:58:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:58:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:58:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:58:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:58:53.409766 543705 memory.go:184] no items to output this cycle
I0320 22:58:53.409792 543705 cpu.go:275] no items to output this cycle
E0320 22:59:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:03.409773 543705 memory.go:184] no items to output this cycle
I0320 22:59:03.409792 543705 cpu.go:275] no items to output this cycle
E0320 22:59:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:13.409784 543705 memory.go:191] Add success.
W0320 22:59:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 22:59:13.409812 543705 cpu.go:282] Add success.
W0320 22:59:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 22:59:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 22:59:13.420043 543705 net.go:648] Add success.
I0320 22:59:13.422702 543705 net.go:770] primary dev: ETH0
I0320 22:59:13.422715 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:59:13.422726 543705 net.go:698] Add success.
I0320 22:59:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 22:59:14.455129 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 22:59:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 22:59:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 22:59:14.456585 543705 disk_worker.go:494] system disk:vda1
I0320 22:59:14.456615 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 22:59:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 22:59:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:59:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:59:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 22:59:16.472421 543705 disk_local_worker.go:436] Get disk info: []
E0320 22:59:23.410487 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:23.410506 543705 memory.go:184] no items to output this cycle
I0320 22:59:23.410518 543705 cpu.go:275] no items to output this cycle
I0320 22:59:26.550918 543705 disk_info.go:125] begin check local disk info of client
I0320 22:59:26.553404 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 22:59:26.553410 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032e000 0xc00032e040]
E0320 22:59:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:33.409805 543705 memory.go:184] no items to output this cycle
I0320 22:59:33.409817 543705 cpu.go:275] no items to output this cycle
E0320 22:59:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:43.409797 543705 memory.go:191] Add success.
I0320 22:59:43.409800 543705 cpu.go:282] Add success.
I0320 22:59:43.419949 543705 net.go:648] Add success.
I0320 22:59:43.423042 543705 net.go:770] primary dev: ETH0
I0320 22:59:43.423055 543705 net.go:802] Send network stats successfully!,count is 6
I0320 22:59:43.423067 543705 net.go:698] Add success.
I0320 22:59:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 22:59:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 22:59:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 22:59:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 22:59:53.409769 543705 memory.go:184] no items to output this cycle
I0320 22:59:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 23:00:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:03.409783 543705 memory.go:184] no items to output this cycle
I0320 23:00:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 23:00:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:13.409820 543705 memory.go:191] Add success.
I0320 23:00:13.409821 543705 cpu.go:282] Add success.
W0320 23:00:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:00:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:00:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:00:13.420166 543705 net.go:648] Add success.
I0320 23:00:13.423368 543705 net.go:770] primary dev: ETH0
I0320 23:00:13.423383 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:00:13.423397 543705 net.go:698] Add success.
I0320 23:00:13.564123 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"247db265-1b69-44fe-8d56-9452d3b249a7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:00:13.564156 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:00:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:00:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:00:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 23:00:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:00:14.456619 543705 disk_worker.go:494] system disk:vda1
I0320 23:00:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:00:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:00:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:00:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:00:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:00:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:00:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:23.409776 543705 memory.go:184] no items to output this cycle
I0320 23:00:23.409779 543705 cpu.go:275] no items to output this cycle
I0320 23:00:26.553975 543705 disk_info.go:125] begin check local disk info of client
I0320 23:00:26.556413 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:00:26.556420 543705 disk_info.go:196] parse disk info done, disk is : [0xc00027a540 0xc00027a580]
E0320 23:00:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:33.409791 543705 memory.go:184] no items to output this cycle
I0320 23:00:33.409806 543705 cpu.go:275] no items to output this cycle
I0320 23:00:38.652248 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:00:38.652255 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:00:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:43.410630 543705 memory.go:191] Add success.
I0320 23:00:43.409822 543705 cpu.go:282] Add success.
I0320 23:00:43.420360 543705 net.go:648] Add success.
I0320 23:00:43.423312 543705 net.go:770] primary dev: ETH0
I0320 23:00:43.423327 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:00:43.423342 543705 net.go:698] Add success.
I0320 23:00:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:00:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:00:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:00:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:00:53.409794 543705 memory.go:184] no items to output this cycle
I0320 23:00:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 23:01:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:03.409770 543705 memory.go:184] no items to output this cycle
I0320 23:01:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 23:01:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:13.409805 543705 memory.go:191] Add success.
I0320 23:01:13.409805 543705 cpu.go:282] Add success.
W0320 23:01:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:01:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:01:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:01:13.420151 543705 net.go:648] Add success.
I0320 23:01:13.422779 543705 net.go:770] primary dev: ETH0
I0320 23:01:13.422794 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:01:13.422808 543705 net.go:698] Add success.
I0320 23:01:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:01:14.455359 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:01:14.455458 543705 disk_worker.go:708] disk space is not compliant
W0320 23:01:14.455468 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:01:14.457051 543705 disk_worker.go:494] system disk:vda1
I0320 23:01:14.457080 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:01:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:01:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:01:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:01:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:01:16.472483 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:01:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:23.409787 543705 memory.go:184] no items to output this cycle
I0320 23:01:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 23:01:26.556947 543705 disk_info.go:125] begin check local disk info of client
I0320 23:01:26.559417 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:01:26.559431 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056c200 0xc00056c240]
E0320 23:01:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:33.409771 543705 memory.go:184] no items to output this cycle
I0320 23:01:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:01:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:43.409807 543705 cpu.go:282] Add success.
I0320 23:01:43.409811 543705 memory.go:191] Add success.
I0320 23:01:43.419904 543705 net.go:648] Add success.
I0320 23:01:43.422617 543705 net.go:770] primary dev: ETH0
I0320 23:01:43.422630 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:01:43.422642 543705 net.go:698] Add success.
I0320 23:01:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:01:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:01:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:01:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:01:53.409799 543705 memory.go:184] no items to output this cycle
I0320 23:01:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:02:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:03.409787 543705 memory.go:184] no items to output this cycle
I0320 23:02:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:02:13.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:13.409835 543705 memory.go:191] Add success.
I0320 23:02:13.409842 543705 cpu.go:282] Add success.
W0320 23:02:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:02:13.409883 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:02:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:02:13.420239 543705 net.go:648] Add success.
I0320 23:02:13.422871 543705 net.go:770] primary dev: ETH0
I0320 23:02:13.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:02:13.422899 543705 net.go:698] Add success.
W0320 23:02:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:02:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 23:02:14.455185 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:02:14.455903 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:02:14.455912 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:02:14.455918 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:02:14.456819 543705 disk_worker.go:494] system disk:vda1
I0320 23:02:14.456947 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:02:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:02:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:02:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:02:16.457945 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:02:16.458002 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:02:16.458023 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:02:16.472357 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:02:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:23.409806 543705 memory.go:184] no items to output this cycle
I0320 23:02:23.409818 543705 cpu.go:275] no items to output this cycle
I0320 23:02:26.560037 543705 disk_info.go:125] begin check local disk info of client
I0320 23:02:26.562486 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:02:26.562492 543705 disk_info.go:196] parse disk info done, disk is : [0xc000387300 0xc000387340]
E0320 23:02:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:33.409807 543705 memory.go:184] no items to output this cycle
I0320 23:02:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 23:02:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:43.409796 543705 memory.go:191] Add success.
I0320 23:02:43.409823 543705 cpu.go:282] Add success.
I0320 23:02:43.419979 543705 net.go:648] Add success.
I0320 23:02:43.422865 543705 net.go:770] primary dev: ETH0
I0320 23:02:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:02:43.422891 543705 net.go:698] Add success.
I0320 23:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:02:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:02:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:02:53.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:02:53.409810 543705 memory.go:184] no items to output this cycle
I0320 23:02:53.409820 543705 cpu.go:275] no items to output this cycle
E0320 23:03:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:03.409788 543705 memory.go:184] no items to output this cycle
I0320 23:03:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 23:03:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:13.409831 543705 memory.go:191] Add success.
I0320 23:03:13.409844 543705 cpu.go:282] Add success.
W0320 23:03:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:03:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:03:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:03:13.420134 543705 net.go:648] Add success.
I0320 23:03:13.422913 543705 net.go:770] primary dev: ETH0
I0320 23:03:13.422928 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:03:13.422943 543705 net.go:698] Add success.
I0320 23:03:13.507211 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7034b473-324a-4a68-ab62-26055e1696f9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:03:13.507243 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:03:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:03:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:03:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0320 23:03:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:03:14.456848 543705 disk_worker.go:494] system disk:vda1
I0320 23:03:14.456877 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:03:15.455631 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:03:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:03:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:03:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:03:16.472427 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:03:23.410456 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:23.410475 543705 memory.go:184] no items to output this cycle
I0320 23:03:23.410490 543705 cpu.go:275] no items to output this cycle
I0320 23:03:26.563001 543705 disk_info.go:125] begin check local disk info of client
I0320 23:03:26.565482 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:03:26.565488 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046d340 0xc00046d380]
E0320 23:03:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:33.409795 543705 memory.go:184] no items to output this cycle
I0320 23:03:33.409821 543705 cpu.go:275] no items to output this cycle
I0320 23:03:38.652966 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:03:38.652973 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:03:43.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:43.410746 543705 memory.go:191] Add success.
I0320 23:03:43.409847 543705 cpu.go:282] Add success.
I0320 23:03:43.420455 543705 net.go:648] Add success.
I0320 23:03:43.423255 543705 net.go:770] primary dev: ETH0
I0320 23:03:43.423271 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:03:43.423285 543705 net.go:698] Add success.
I0320 23:03:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:03:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:03:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:03:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:03:53.409781 543705 memory.go:184] no items to output this cycle
I0320 23:03:53.409784 543705 cpu.go:275] no items to output this cycle
E0320 23:04:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:03.409763 543705 memory.go:184] no items to output this cycle
I0320 23:04:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 23:04:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:13.409805 543705 memory.go:191] Add success.
I0320 23:04:13.409808 543705 cpu.go:282] Add success.
W0320 23:04:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:04:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:04:13.420159 543705 net.go:648] Add success.
I0320 23:04:13.423175 543705 net.go:770] primary dev: ETH0
I0320 23:04:13.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:04:13.423201 543705 net.go:698] Add success.
I0320 23:04:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:04:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:04:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0320 23:04:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:04:14.456619 543705 disk_worker.go:494] system disk:vda1
I0320 23:04:14.456663 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:04:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:04:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:04:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:04:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:04:16.472389 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:04:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:23.409774 543705 memory.go:184] no items to output this cycle
I0320 23:04:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 23:04:26.565670 543705 disk_info.go:125] begin check local disk info of client
I0320 23:04:26.568418 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:04:26.568424 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ae40 0xc00007ae80]
E0320 23:04:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:33.409770 543705 memory.go:184] no items to output this cycle
I0320 23:04:33.409779 543705 cpu.go:275] no items to output this cycle
E0320 23:04:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:43.409815 543705 memory.go:191] Add success.
I0320 23:04:43.409823 543705 cpu.go:282] Add success.
I0320 23:04:43.419972 543705 net.go:648] Add success.
I0320 23:04:43.423004 543705 net.go:770] primary dev: ETH0
I0320 23:04:43.423019 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:04:43.423034 543705 net.go:698] Add success.
I0320 23:04:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:04:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:04:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:04:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:04:53.409784 543705 memory.go:184] no items to output this cycle
I0320 23:04:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 23:05:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:03.409804 543705 memory.go:184] no items to output this cycle
I0320 23:05:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 23:05:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:13.409794 543705 memory.go:191] Add success.
I0320 23:05:13.409809 543705 cpu.go:282] Add success.
W0320 23:05:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:05:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:05:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:05:13.420227 543705 net.go:648] Add success.
I0320 23:05:13.422850 543705 net.go:770] primary dev: ETH0
I0320 23:05:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:05:13.422876 543705 net.go:698] Add success.
I0320 23:05:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:05:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:05:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0320 23:05:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:05:14.456512 543705 disk_worker.go:494] system disk:vda1
I0320 23:05:14.456557 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:05:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:05:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:05:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:05:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:05:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:05:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:23.409804 543705 memory.go:184] no items to output this cycle
I0320 23:05:23.409815 543705 cpu.go:275] no items to output this cycle
I0320 23:05:26.569104 543705 disk_info.go:125] begin check local disk info of client
I0320 23:05:26.571634 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:05:26.571641 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5e80 0xc0000c5ec0]
E0320 23:05:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:33.409794 543705 memory.go:184] no items to output this cycle
I0320 23:05:33.409805 543705 cpu.go:275] no items to output this cycle
E0320 23:05:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:43.409825 543705 memory.go:191] Add success.
I0320 23:05:43.409828 543705 cpu.go:282] Add success.
I0320 23:05:43.419875 543705 net.go:648] Add success.
I0320 23:05:43.422792 543705 net.go:770] primary dev: ETH0
I0320 23:05:43.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:05:43.422817 543705 net.go:698] Add success.
I0320 23:05:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:05:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:05:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:05:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:05:53.409773 543705 memory.go:184] no items to output this cycle
I0320 23:05:53.409809 543705 cpu.go:275] no items to output this cycle
E0320 23:06:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:03.409794 543705 memory.go:184] no items to output this cycle
I0320 23:06:03.409807 543705 cpu.go:275] no items to output this cycle
E0320 23:06:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:13.409804 543705 memory.go:191] Add success.
I0320 23:06:13.409820 543705 cpu.go:282] Add success.
W0320 23:06:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:06:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:06:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:06:13.420166 543705 net.go:648] Add success.
I0320 23:06:13.423227 543705 net.go:770] primary dev: ETH0
I0320 23:06:13.423242 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:06:13.423256 543705 net.go:698] Add success.
I0320 23:06:13.777487 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"75dfdbf0-6953-408d-a8b8-f34501662a05","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:06:13.777522 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:06:14.453971 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:06:14.455237 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:06:14.455309 543705 disk_worker.go:708] disk space is not compliant
W0320 23:06:14.455313 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:06:14.456882 543705 disk_worker.go:494] system disk:vda1
I0320 23:06:14.456914 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:06:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:06:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:06:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:06:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:06:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:06:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:23.409779 543705 memory.go:184] no items to output this cycle
I0320 23:06:23.409782 543705 cpu.go:275] no items to output this cycle
I0320 23:06:26.572015 543705 disk_info.go:125] begin check local disk info of client
I0320 23:06:26.574746 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:06:26.574754 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4100 0xc0000c4140]
E0320 23:06:33.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:33.409763 543705 memory.go:184] no items to output this cycle
I0320 23:06:33.409796 543705 cpu.go:275] no items to output this cycle
I0320 23:06:38.653730 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:06:38.653736 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:06:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:43.410782 543705 memory.go:191] Add success.
I0320 23:06:43.409804 543705 cpu.go:282] Add success.
I0320 23:06:43.420473 543705 net.go:648] Add success.
I0320 23:06:43.423280 543705 net.go:770] primary dev: ETH0
I0320 23:06:43.423292 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:06:43.423305 543705 net.go:698] Add success.
I0320 23:06:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:06:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:06:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:06:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:06:53.409780 543705 memory.go:184] no items to output this cycle
I0320 23:06:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 23:07:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:03.409780 543705 memory.go:184] no items to output this cycle
I0320 23:07:03.409782 543705 cpu.go:275] no items to output this cycle
E0320 23:07:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:13.409797 543705 memory.go:191] Add success.
I0320 23:07:13.409797 543705 cpu.go:282] Add success.
W0320 23:07:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:07:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:07:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:07:13.420112 543705 net.go:648] Add success.
I0320 23:07:13.423359 543705 net.go:770] primary dev: ETH0
I0320 23:07:13.423373 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:07:13.423385 543705 net.go:698] Add success.
I0320 23:07:13.452859 543705 event_worker.go:152] Polling the log file for events...
W0320 23:07:14.455167 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:07:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0320 23:07:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:07:14.455915 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:07:14.455924 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:07:14.455930 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:07:14.456552 543705 disk_worker.go:494] system disk:vda1
I0320 23:07:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:07:15.456938 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:07:15.456951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:07:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:07:16.457991 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:07:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:07:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:07:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:07:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:23.409783 543705 memory.go:184] no items to output this cycle
I0320 23:07:23.409783 543705 cpu.go:275] no items to output this cycle
I0320 23:07:26.575125 543705 disk_info.go:125] begin check local disk info of client
I0320 23:07:26.577866 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:07:26.577873 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5380 0xc0000c53c0]
E0320 23:07:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:33.409793 543705 memory.go:184] no items to output this cycle
I0320 23:07:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 23:07:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:43.409780 543705 memory.go:191] Add success.
I0320 23:07:43.409804 543705 cpu.go:282] Add success.
I0320 23:07:43.419853 543705 net.go:648] Add success.
I0320 23:07:43.422883 543705 net.go:770] primary dev: ETH0
I0320 23:07:43.422900 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:07:43.422912 543705 net.go:698] Add success.
I0320 23:07:46.458006 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:07:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:07:46.458102 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:07:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:07:53.409776 543705 memory.go:184] no items to output this cycle
I0320 23:07:53.409796 543705 cpu.go:275] no items to output this cycle
E0320 23:08:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:03.409768 543705 memory.go:184] no items to output this cycle
I0320 23:08:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 23:08:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:13.409786 543705 memory.go:191] Add success.
W0320 23:08:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:08:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:08:13.409821 543705 cpu.go:282] Add success.
I0320 23:08:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:08:13.420159 543705 net.go:648] Add success.
I0320 23:08:13.422818 543705 net.go:770] primary dev: ETH0
I0320 23:08:13.422831 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:08:13.422842 543705 net.go:698] Add success.
I0320 23:08:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:08:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:08:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 23:08:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:08:14.456595 543705 disk_worker.go:494] system disk:vda1
I0320 23:08:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:08:15.455949 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:08:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:08:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:08:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:08:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:08:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:23.409772 543705 memory.go:184] no items to output this cycle
I0320 23:08:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 23:08:26.578726 543705 disk_info.go:125] begin check local disk info of client
I0320 23:08:26.581511 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:08:26.581517 543705 disk_info.go:196] parse disk info done, disk is : [0xc000366fc0 0xc000367000]
E0320 23:08:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:33.409776 543705 memory.go:184] no items to output this cycle
I0320 23:08:33.409783 543705 cpu.go:275] no items to output this cycle
E0320 23:08:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:43.409788 543705 cpu.go:282] Add success.
I0320 23:08:43.409795 543705 memory.go:191] Add success.
I0320 23:08:43.419877 543705 net.go:648] Add success.
I0320 23:08:43.422507 543705 net.go:770] primary dev: ETH0
I0320 23:08:43.422519 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:08:43.422533 543705 net.go:698] Add success.
I0320 23:08:46.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:08:46.458073 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:08:46.458104 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:08:53.410458 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:08:53.410464 543705 cpu.go:275] no items to output this cycle
I0320 23:08:53.410477 543705 memory.go:184] no items to output this cycle
E0320 23:09:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:03.409777 543705 memory.go:184] no items to output this cycle
I0320 23:09:03.409788 543705 cpu.go:275] no items to output this cycle
E0320 23:09:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:13.409816 543705 memory.go:191] Add success.
I0320 23:09:13.409820 543705 cpu.go:282] Add success.
W0320 23:09:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:09:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:09:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:09:13.420135 543705 net.go:648] Add success.
I0320 23:09:13.422638 543705 net.go:770] primary dev: ETH0
I0320 23:09:13.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:09:13.422662 543705 net.go:698] Add success.
I0320 23:09:13.469811 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"063a3a44-5b7a-4373-a901-7c790edcaa19","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:09:13.469844 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:09:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:09:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:09:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0320 23:09:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:09:14.456607 543705 disk_worker.go:494] system disk:vda1
I0320 23:09:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:09:15.455649 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:09:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:09:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:09:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:09:16.472434 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:09:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:23.409797 543705 memory.go:184] no items to output this cycle
I0320 23:09:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 23:09:26.582213 543705 disk_info.go:125] begin check local disk info of client
I0320 23:09:26.584724 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:09:26.584730 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056dbc0 0xc00056dc00]
E0320 23:09:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:33.409798 543705 memory.go:184] no items to output this cycle
I0320 23:09:33.409809 543705 cpu.go:275] no items to output this cycle
I0320 23:09:38.653875 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:09:38.653882 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:09:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:43.410651 543705 memory.go:191] Add success.
I0320 23:09:43.409781 543705 cpu.go:282] Add success.
I0320 23:09:43.420374 543705 net.go:648] Add success.
I0320 23:09:43.422969 543705 net.go:770] primary dev: ETH0
I0320 23:09:43.422982 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:09:43.422995 543705 net.go:698] Add success.
I0320 23:09:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:09:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:09:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:09:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:09:53.409805 543705 memory.go:184] no items to output this cycle
I0320 23:09:53.409817 543705 cpu.go:275] no items to output this cycle
E0320 23:10:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:03.409765 543705 memory.go:184] no items to output this cycle
I0320 23:10:03.409797 543705 cpu.go:275] no items to output this cycle
E0320 23:10:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:13.409806 543705 memory.go:191] Add success.
I0320 23:10:13.409807 543705 cpu.go:282] Add success.
W0320 23:10:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:10:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:10:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:10:13.420160 543705 net.go:648] Add success.
I0320 23:10:13.422898 543705 net.go:770] primary dev: ETH0
I0320 23:10:13.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:10:13.422926 543705 net.go:698] Add success.
I0320 23:10:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:10:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:10:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 23:10:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:10:14.456588 543705 disk_worker.go:494] system disk:vda1
I0320 23:10:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:10:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:10:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:10:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:10:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:10:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:10:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:23.409798 543705 memory.go:184] no items to output this cycle
I0320 23:10:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 23:10:26.585168 543705 disk_info.go:125] begin check local disk info of client
I0320 23:10:26.587665 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:10:26.587671 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa5c0 0xc0001fa600]
E0320 23:10:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:33.409804 543705 memory.go:184] no items to output this cycle
I0320 23:10:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:10:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:43.409795 543705 memory.go:191] Add success.
I0320 23:10:43.409820 543705 cpu.go:282] Add success.
I0320 23:10:43.419989 543705 net.go:648] Add success.
I0320 23:10:43.422801 543705 net.go:770] primary dev: ETH0
I0320 23:10:43.422816 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:10:43.422830 543705 net.go:698] Add success.
I0320 23:10:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:10:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:10:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:10:53.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:10:53.409809 543705 memory.go:184] no items to output this cycle
I0320 23:10:53.409819 543705 cpu.go:275] no items to output this cycle
E0320 23:11:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:03.409785 543705 memory.go:184] no items to output this cycle
I0320 23:11:03.409809 543705 cpu.go:275] no items to output this cycle
E0320 23:11:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:13.409811 543705 memory.go:191] Add success.
I0320 23:11:13.409813 543705 cpu.go:282] Add success.
W0320 23:11:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:11:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:11:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:11:13.420125 543705 net.go:648] Add success.
I0320 23:11:13.422687 543705 net.go:770] primary dev: ETH0
I0320 23:11:13.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:11:13.422730 543705 net.go:698] Add success.
I0320 23:11:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:11:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:11:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0320 23:11:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:11:14.456563 543705 disk_worker.go:494] system disk:vda1
I0320 23:11:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:11:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:11:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:11:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:11:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:11:16.472552 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:11:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:23.409791 543705 memory.go:184] no items to output this cycle
I0320 23:11:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 23:11:26.588103 543705 disk_info.go:125] begin check local disk info of client
I0320 23:11:26.590613 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:11:26.590620 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bc300 0xc0004bc340]
E0320 23:11:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:33.409781 543705 memory.go:184] no items to output this cycle
I0320 23:11:33.409786 543705 cpu.go:275] no items to output this cycle
E0320 23:11:43.409817 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:43.409860 543705 memory.go:191] Add success.
I0320 23:11:43.409874 543705 cpu.go:282] Add success.
I0320 23:11:43.420061 543705 net.go:648] Add success.
I0320 23:11:43.422994 543705 net.go:770] primary dev: ETH0
I0320 23:11:43.423007 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:11:43.423020 543705 net.go:698] Add success.
I0320 23:11:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:11:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:11:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:11:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:11:53.409768 543705 memory.go:184] no items to output this cycle
I0320 23:11:53.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:12:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:03.409797 543705 memory.go:184] no items to output this cycle
I0320 23:12:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 23:12:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:13.409791 543705 memory.go:191] Add success.
W0320 23:12:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:12:13.412259 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:12:13.412264 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:12:13.409923 543705 cpu.go:282] Add success.
I0320 23:12:13.419937 543705 net.go:648] Add success.
I0320 23:12:13.421838 543705 net.go:770] primary dev: ETH0
I0320 23:12:13.421850 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:12:13.421863 543705 net.go:698] Add success.
I0320 23:12:13.468664 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b1ba4977-155b-4924-9961-17925447f00d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:12:13.468700 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 23:12:14.455128 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:12:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0320 23:12:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:12:14.456817 543705 disk_worker.go:494] system disk:vda1
I0320 23:12:14.456869 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:12:14.457149 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:12:14.457157 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:12:14.457162 543705 custom_config.go:64] query custom config with name: gpu
E0320 23:12:15.456801 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:12:15.456810 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:12:16.457954 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:12:16.457954 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:12:16.458010 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:12:16.458029 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:12:16.472444 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:12:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:23.409771 543705 memory.go:184] no items to output this cycle
I0320 23:12:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 23:12:26.591179 543705 disk_info.go:125] begin check local disk info of client
I0320 23:12:26.593630 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:12:26.593638 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b39c0 0xc0003b3a00]
E0320 23:12:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:33.409792 543705 memory.go:184] no items to output this cycle
I0320 23:12:33.409807 543705 cpu.go:275] no items to output this cycle
I0320 23:12:38.654973 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:12:38.654979 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:12:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:43.410601 543705 memory.go:191] Add success.
I0320 23:12:43.409824 543705 cpu.go:282] Add success.
I0320 23:12:43.420354 543705 net.go:648] Add success.
I0320 23:12:43.422801 543705 net.go:770] primary dev: ETH0
I0320 23:12:43.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:12:43.422827 543705 net.go:698] Add success.
I0320 23:12:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:12:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:12:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:12:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:12:53.409770 543705 memory.go:184] no items to output this cycle
I0320 23:12:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:13:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:03.409775 543705 memory.go:184] no items to output this cycle
I0320 23:13:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 23:13:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:13.409826 543705 memory.go:191] Add success.
I0320 23:13:13.409832 543705 cpu.go:282] Add success.
W0320 23:13:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:13:13.409876 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:13:13.409880 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:13:13.420164 543705 net.go:648] Add success.
I0320 23:13:13.423075 543705 net.go:770] primary dev: ETH0
I0320 23:13:13.423106 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:13:13.423119 543705 net.go:698] Add success.
I0320 23:13:14.454488 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:13:14.454643 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:13:14.454719 543705 disk_worker.go:708] disk space is not compliant
W0320 23:13:14.454722 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:13:14.456338 543705 disk_worker.go:494] system disk:vda1
I0320 23:13:14.456368 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:13:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:13:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:13:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:13:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:13:16.472522 543705 disk_local_worker.go:436] Get disk info: []
I0320 23:13:23.409916 543705 cpu.go:275] no items to output this cycle
E0320 23:13:23.410090 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:23.410102 543705 memory.go:184] no items to output this cycle
I0320 23:13:26.594726 543705 disk_info.go:125] begin check local disk info of client
I0320 23:13:26.597248 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:13:26.597262 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5c40 0xc0002a5c80]
E0320 23:13:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:33.409776 543705 memory.go:184] no items to output this cycle
I0320 23:13:33.409801 543705 cpu.go:275] no items to output this cycle
E0320 23:13:43.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:43.409826 543705 memory.go:191] Add success.
I0320 23:13:43.409831 543705 cpu.go:282] Add success.
I0320 23:13:43.420083 543705 net.go:648] Add success.
I0320 23:13:43.422728 543705 net.go:770] primary dev: ETH0
I0320 23:13:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:13:43.422759 543705 net.go:698] Add success.
I0320 23:13:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:13:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:13:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:13:53.410401 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:13:53.410418 543705 memory.go:184] no items to output this cycle
I0320 23:13:53.410437 543705 cpu.go:275] no items to output this cycle
E0320 23:14:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:03.409795 543705 memory.go:184] no items to output this cycle
I0320 23:14:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:14:13.409814 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:13.409825 543705 cpu.go:282] Add success.
I0320 23:14:13.409847 543705 memory.go:191] Add success.
W0320 23:14:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:14:13.409906 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:14:13.409911 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:14:13.420254 543705 net.go:648] Add success.
I0320 23:14:13.423035 543705 net.go:770] primary dev: ETH0
I0320 23:14:13.423051 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:14:13.423082 543705 net.go:698] Add success.
I0320 23:14:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:14:14.455207 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:14:14.455220 543705 disk_worker.go:708] disk space is not compliant
W0320 23:14:14.455223 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:14:14.456633 543705 disk_worker.go:494] system disk:vda1
I0320 23:14:14.456669 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:14:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:14:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:14:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:14:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:14:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:14:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:23.409774 543705 memory.go:184] no items to output this cycle
I0320 23:14:23.409792 543705 cpu.go:275] no items to output this cycle
I0320 23:14:26.598238 543705 disk_info.go:125] begin check local disk info of client
I0320 23:14:26.600968 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:14:26.600976 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c97c0 0xc0004c9800]
E0320 23:14:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:33.409766 543705 memory.go:184] no items to output this cycle
I0320 23:14:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 23:14:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:43.409806 543705 memory.go:191] Add success.
I0320 23:14:43.409807 543705 cpu.go:282] Add success.
I0320 23:14:43.420075 543705 net.go:648] Add success.
I0320 23:14:43.422690 543705 net.go:770] primary dev: ETH0
I0320 23:14:43.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:14:43.422720 543705 net.go:698] Add success.
I0320 23:14:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:14:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:14:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:14:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:14:53.409779 543705 memory.go:184] no items to output this cycle
I0320 23:14:53.409783 543705 cpu.go:275] no items to output this cycle
E0320 23:15:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:03.409774 543705 memory.go:184] no items to output this cycle
I0320 23:15:03.409778 543705 cpu.go:275] no items to output this cycle
E0320 23:15:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:13.409829 543705 memory.go:191] Add success.
I0320 23:15:13.409844 543705 cpu.go:282] Add success.
W0320 23:15:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:15:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:15:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:15:13.420237 543705 net.go:648] Add success.
I0320 23:15:13.423317 543705 net.go:770] primary dev: ETH0
I0320 23:15:13.423331 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:15:13.423343 543705 net.go:698] Add success.
I0320 23:15:13.468095 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a682af4a-5a05-4ef7-9d91-fdedd2928dc7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:15:13.468130 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:15:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:15:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:15:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0320 23:15:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:15:14.456720 543705 disk_worker.go:494] system disk:vda1
I0320 23:15:14.456749 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:15:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:15:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:15:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:15:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:15:16.472415 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:15:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:23.409811 543705 memory.go:184] no items to output this cycle
I0320 23:15:23.409819 543705 cpu.go:275] no items to output this cycle
I0320 23:15:26.601215 543705 disk_info.go:125] begin check local disk info of client
I0320 23:15:26.603835 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:15:26.603843 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032ecc0 0xc00032ed00]
E0320 23:15:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:33.409781 543705 memory.go:184] no items to output this cycle
I0320 23:15:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 23:15:38.655974 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:15:38.655980 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:15:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:43.410601 543705 memory.go:191] Add success.
I0320 23:15:43.409800 543705 cpu.go:282] Add success.
I0320 23:15:43.420303 543705 net.go:648] Add success.
I0320 23:15:43.422749 543705 net.go:770] primary dev: ETH0
I0320 23:15:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:15:43.422774 543705 net.go:698] Add success.
I0320 23:15:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:15:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:15:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:15:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:15:53.409771 543705 memory.go:184] no items to output this cycle
I0320 23:15:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 23:16:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:03.409778 543705 memory.go:184] no items to output this cycle
I0320 23:16:03.409781 543705 cpu.go:275] no items to output this cycle
E0320 23:16:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:13.409818 543705 memory.go:191] Add success.
I0320 23:16:13.409829 543705 cpu.go:282] Add success.
W0320 23:16:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:16:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:16:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:16:13.420373 543705 net.go:648] Add success.
I0320 23:16:13.423613 543705 net.go:770] primary dev: ETH0
I0320 23:16:13.423629 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:16:13.423643 543705 net.go:698] Add success.
I0320 23:16:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:16:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:16:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0320 23:16:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:16:14.456550 543705 disk_worker.go:494] system disk:vda1
I0320 23:16:14.456597 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:16:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:16:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:16:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:16:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:16:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:16:23.410639 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:23.410655 543705 memory.go:184] no items to output this cycle
I0320 23:16:23.410663 543705 cpu.go:275] no items to output this cycle
I0320 23:16:26.604277 543705 disk_info.go:125] begin check local disk info of client
I0320 23:16:26.606776 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:16:26.606783 543705 disk_info.go:196] parse disk info done, disk is : [0xc000566440 0xc000566480]
E0320 23:16:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:33.409804 543705 memory.go:184] no items to output this cycle
I0320 23:16:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 23:16:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:43.409793 543705 memory.go:191] Add success.
I0320 23:16:43.409814 543705 cpu.go:282] Add success.
I0320 23:16:43.419968 543705 net.go:648] Add success.
I0320 23:16:43.422859 543705 net.go:770] primary dev: ETH0
I0320 23:16:43.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:16:43.422884 543705 net.go:698] Add success.
I0320 23:16:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:16:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:16:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:16:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:16:53.409777 543705 memory.go:184] no items to output this cycle
I0320 23:16:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 23:17:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:03.409786 543705 memory.go:184] no items to output this cycle
I0320 23:17:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 23:17:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:13.409819 543705 memory.go:191] Add success.
I0320 23:17:13.409822 543705 cpu.go:282] Add success.
W0320 23:17:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:17:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:17:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:17:13.420147 543705 net.go:648] Add success.
I0320 23:17:13.423160 543705 net.go:770] primary dev: ETH0
I0320 23:17:13.423173 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:17:13.423184 543705 net.go:698] Add success.
I0320 23:17:13.453752 543705 event_worker.go:152] Polling the log file for events...
W0320 23:17:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:17:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 23:17:14.455187 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:17:14.455904 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:17:14.455912 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:17:14.455918 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:17:14.456638 543705 disk_worker.go:494] system disk:vda1
I0320 23:17:14.456682 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:17:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:17:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:17:16.458037 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:17:16.458047 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:17:16.458097 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:17:16.458114 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:17:16.472479 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:17:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:23.409795 543705 memory.go:184] no items to output this cycle
I0320 23:17:23.409809 543705 cpu.go:275] no items to output this cycle
I0320 23:17:26.606857 543705 disk_info.go:125] begin check local disk info of client
I0320 23:17:26.609637 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:17:26.609655 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a7400 0xc0004a7440]
E0320 23:17:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:33.409775 543705 memory.go:184] no items to output this cycle
I0320 23:17:33.409790 543705 cpu.go:275] no items to output this cycle
E0320 23:17:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:43.409791 543705 memory.go:191] Add success.
I0320 23:17:43.409821 543705 cpu.go:282] Add success.
I0320 23:17:43.419907 543705 net.go:648] Add success.
I0320 23:17:43.422586 543705 net.go:770] primary dev: ETH0
I0320 23:17:43.422600 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:17:43.422614 543705 net.go:698] Add success.
I0320 23:17:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:17:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:17:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:17:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:17:53.409770 543705 memory.go:184] no items to output this cycle
I0320 23:17:53.409782 543705 cpu.go:275] no items to output this cycle
E0320 23:18:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:03.409798 543705 memory.go:184] no items to output this cycle
I0320 23:18:03.409811 543705 cpu.go:275] no items to output this cycle
E0320 23:18:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:13.409801 543705 memory.go:191] Add success.
W0320 23:18:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:18:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:18:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:18:13.409888 543705 cpu.go:282] Add success.
I0320 23:18:13.420526 543705 net.go:648] Add success.
I0320 23:18:13.423109 543705 net.go:770] primary dev: ETH0
I0320 23:18:13.423127 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:18:13.423148 543705 net.go:698] Add success.
I0320 23:18:13.468437 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"823884a6-95a6-4de4-b954-5655f712c0d7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:18:13.468473 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:18:14.454989 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:18:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:18:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 23:18:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:18:14.456583 543705 disk_worker.go:494] system disk:vda1
I0320 23:18:14.456644 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:18:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:18:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:18:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:18:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:18:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:18:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:23.409799 543705 memory.go:184] no items to output this cycle
I0320 23:18:23.409810 543705 cpu.go:275] no items to output this cycle
I0320 23:18:26.610731 543705 disk_info.go:125] begin check local disk info of client
I0320 23:18:26.613199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:18:26.613205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fee40 0xc0003fee80]
E0320 23:18:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:33.409781 543705 memory.go:184] no items to output this cycle
I0320 23:18:33.409786 543705 cpu.go:275] no items to output this cycle
I0320 23:18:38.656982 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:18:38.656990 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:18:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:43.410664 543705 memory.go:191] Add success.
I0320 23:18:43.409814 543705 cpu.go:282] Add success.
I0320 23:18:43.420484 543705 net.go:648] Add success.
I0320 23:18:43.422994 543705 net.go:770] primary dev: ETH0
I0320 23:18:43.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:18:43.423024 543705 net.go:698] Add success.
I0320 23:18:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:18:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:18:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:18:53.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:18:53.409787 543705 memory.go:184] no items to output this cycle
I0320 23:18:53.409789 543705 cpu.go:275] no items to output this cycle
E0320 23:19:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:03.409797 543705 memory.go:184] no items to output this cycle
I0320 23:19:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:19:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:13.409786 543705 memory.go:191] Add success.
I0320 23:19:13.409808 543705 cpu.go:282] Add success.
W0320 23:19:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:19:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:19:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:19:13.420064 543705 net.go:648] Add success.
I0320 23:19:13.423372 543705 net.go:770] primary dev: ETH0
I0320 23:19:13.423384 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:19:13.423395 543705 net.go:698] Add success.
I0320 23:19:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:19:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:19:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0320 23:19:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:19:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 23:19:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:19:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:19:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:19:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:19:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:19:16.472428 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:19:23.410362 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:23.410378 543705 memory.go:184] no items to output this cycle
I0320 23:19:23.410394 543705 cpu.go:275] no items to output this cycle
I0320 23:19:26.613300 543705 disk_info.go:125] begin check local disk info of client
I0320 23:19:26.615818 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:19:26.615825 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003be480 0xc0003be4c0]
E0320 23:19:33.409879 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:33.409910 543705 memory.go:184] no items to output this cycle
I0320 23:19:33.409910 543705 cpu.go:275] no items to output this cycle
E0320 23:19:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:43.409824 543705 memory.go:191] Add success.
I0320 23:19:43.409833 543705 cpu.go:282] Add success.
I0320 23:19:43.420010 543705 net.go:648] Add success.
I0320 23:19:43.422597 543705 net.go:770] primary dev: ETH0
I0320 23:19:43.422610 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:19:43.422623 543705 net.go:698] Add success.
I0320 23:19:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:19:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:19:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:19:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:19:53.409773 543705 memory.go:184] no items to output this cycle
I0320 23:19:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 23:20:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:03.409784 543705 memory.go:184] no items to output this cycle
I0320 23:20:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 23:20:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:13.409791 543705 memory.go:191] Add success.
I0320 23:20:13.409795 543705 cpu.go:282] Add success.
W0320 23:20:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:20:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:20:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:20:13.420080 543705 net.go:648] Add success.
I0320 23:20:13.422714 543705 net.go:770] primary dev: ETH0
I0320 23:20:13.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:20:13.422739 543705 net.go:698] Add success.
I0320 23:20:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:20:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:20:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 23:20:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:20:14.456568 543705 disk_worker.go:494] system disk:vda1
I0320 23:20:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:20:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:20:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:20:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:20:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:20:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:20:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:23.409782 543705 memory.go:184] no items to output this cycle
I0320 23:20:23.409784 543705 cpu.go:275] no items to output this cycle
I0320 23:20:26.616256 543705 disk_info.go:125] begin check local disk info of client
I0320 23:20:26.618765 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:20:26.618772 543705 disk_info.go:196] parse disk info done, disk is : [0xc00047b9c0 0xc00047ba00]
E0320 23:20:33.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:33.410032 543705 memory.go:184] no items to output this cycle
I0320 23:20:33.409892 543705 cpu.go:275] no items to output this cycle
E0320 23:20:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:43.409798 543705 memory.go:191] Add success.
I0320 23:20:43.409809 543705 cpu.go:282] Add success.
I0320 23:20:43.419975 543705 net.go:648] Add success.
I0320 23:20:43.422468 543705 net.go:770] primary dev: ETH0
I0320 23:20:43.422483 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:20:43.422498 543705 net.go:698] Add success.
I0320 23:20:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:20:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:20:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:20:53.410372 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:20:53.410388 543705 memory.go:184] no items to output this cycle
I0320 23:20:53.410428 543705 cpu.go:275] no items to output this cycle
E0320 23:21:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:03.409782 543705 memory.go:184] no items to output this cycle
I0320 23:21:03.409786 543705 cpu.go:275] no items to output this cycle
E0320 23:21:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:13.409827 543705 memory.go:191] Add success.
I0320 23:21:13.409839 543705 cpu.go:282] Add success.
W0320 23:21:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:21:13.409879 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:21:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:21:13.420164 543705 net.go:648] Add success.
I0320 23:21:13.422916 543705 net.go:770] primary dev: ETH0
I0320 23:21:13.422929 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:21:13.422941 543705 net.go:698] Add success.
I0320 23:21:13.629296 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"49bbb84d-637f-40de-9e8c-71cb36bc1c57","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:21:13.629330 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:21:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:21:14.455191 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:21:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0320 23:21:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:21:14.456604 543705 disk_worker.go:494] system disk:vda1
I0320 23:21:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:21:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:21:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:21:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:21:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:21:16.472464 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:21:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:23.409779 543705 memory.go:184] no items to output this cycle
I0320 23:21:23.409796 543705 cpu.go:275] no items to output this cycle
I0320 23:21:26.619368 543705 disk_info.go:125] begin check local disk info of client
I0320 23:21:26.621877 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:21:26.621883 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002aa580 0xc0002aa5c0]
E0320 23:21:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:33.409893 543705 memory.go:184] no items to output this cycle
I0320 23:21:33.409896 543705 cpu.go:275] no items to output this cycle
I0320 23:21:38.657734 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:21:38.657742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:21:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:43.410577 543705 memory.go:191] Add success.
I0320 23:21:43.409797 543705 cpu.go:282] Add success.
I0320 23:21:43.420299 543705 net.go:648] Add success.
I0320 23:21:43.422921 543705 net.go:770] primary dev: ETH0
I0320 23:21:43.422935 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:21:43.422947 543705 net.go:698] Add success.
I0320 23:21:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:21:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:21:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:21:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:21:53.409773 543705 memory.go:184] no items to output this cycle
I0320 23:21:53.409790 543705 cpu.go:275] no items to output this cycle
E0320 23:22:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:03.409797 543705 memory.go:184] no items to output this cycle
I0320 23:22:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:22:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:13.409794 543705 memory.go:191] Add success.
W0320 23:22:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0320 23:22:13.409824 543705 cpu.go:282] Add success.
W0320 23:22:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:22:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:22:13.420193 543705 net.go:648] Add success.
I0320 23:22:13.422825 543705 net.go:770] primary dev: ETH0
I0320 23:22:13.422840 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:22:13.422855 543705 net.go:698] Add success.
W0320 23:22:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:22:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0320 23:22:14.455226 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:22:14.456881 543705 disk_worker.go:494] system disk:vda1
I0320 23:22:14.456923 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:22:14.457345 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:22:14.457354 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:22:14.457359 543705 custom_config.go:64] query custom config with name: gpu
E0320 23:22:15.456808 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:22:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:22:16.457934 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:22:16.457934 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:22:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:22:16.458008 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:22:16.472339 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:22:23.410375 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:23.410389 543705 memory.go:184] no items to output this cycle
I0320 23:22:23.410400 543705 cpu.go:275] no items to output this cycle
I0320 23:22:26.622728 543705 disk_info.go:125] begin check local disk info of client
I0320 23:22:26.625160 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:22:26.625167 543705 disk_info.go:196] parse disk info done, disk is : [0xc00049e280 0xc00049e2c0]
E0320 23:22:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:33.409803 543705 memory.go:184] no items to output this cycle
I0320 23:22:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 23:22:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:43.409822 543705 memory.go:191] Add success.
I0320 23:22:43.409825 543705 cpu.go:282] Add success.
I0320 23:22:43.419964 543705 net.go:648] Add success.
I0320 23:22:43.422540 543705 net.go:770] primary dev: ETH0
I0320 23:22:43.422554 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:22:43.422568 543705 net.go:698] Add success.
I0320 23:22:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:22:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:22:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:22:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:22:53.409778 543705 cpu.go:275] no items to output this cycle
I0320 23:22:53.409779 543705 memory.go:184] no items to output this cycle
E0320 23:23:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:03.409794 543705 memory.go:184] no items to output this cycle
I0320 23:23:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:23:13.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:13.409826 543705 memory.go:191] Add success.
I0320 23:23:13.409830 543705 cpu.go:282] Add success.
W0320 23:23:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:23:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:23:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:23:13.420145 543705 net.go:648] Add success.
I0320 23:23:13.422818 543705 net.go:770] primary dev: ETH0
I0320 23:23:13.422833 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:23:13.422847 543705 net.go:698] Add success.
I0320 23:23:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:23:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:23:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0320 23:23:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:23:14.456515 543705 disk_worker.go:494] system disk:vda1
I0320 23:23:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:23:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:23:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:23:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:23:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:23:16.472430 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:23:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:23.409768 543705 memory.go:184] no items to output this cycle
I0320 23:23:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 23:23:26.625356 543705 disk_info.go:125] begin check local disk info of client
I0320 23:23:26.627860 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:23:26.627867 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037c340 0xc00037c380]
E0320 23:23:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:33.409800 543705 memory.go:184] no items to output this cycle
I0320 23:23:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 23:23:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:43.409790 543705 memory.go:191] Add success.
I0320 23:23:43.409814 543705 cpu.go:282] Add success.
I0320 23:23:43.420110 543705 net.go:648] Add success.
I0320 23:23:43.422973 543705 net.go:770] primary dev: ETH0
I0320 23:23:43.422985 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:23:43.422997 543705 net.go:698] Add success.
I0320 23:23:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:23:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:23:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:23:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:23:53.409777 543705 memory.go:184] no items to output this cycle
I0320 23:23:53.409781 543705 cpu.go:275] no items to output this cycle
E0320 23:24:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:03.409782 543705 cpu.go:275] no items to output this cycle
I0320 23:24:03.409792 543705 memory.go:184] no items to output this cycle
E0320 23:24:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:13.409789 543705 memory.go:191] Add success.
I0320 23:24:13.409813 543705 cpu.go:282] Add success.
W0320 23:24:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:24:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:24:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:24:13.420103 543705 net.go:648] Add success.
I0320 23:24:13.422788 543705 net.go:770] primary dev: ETH0
I0320 23:24:13.422802 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:24:13.422814 543705 net.go:698] Add success.
I0320 23:24:13.469336 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c76ca5e4-6f0e-404b-870e-518c81e73d9b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:24:13.469369 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:24:14.454987 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:24:14.455183 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:24:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0320 23:24:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:24:14.456597 543705 disk_worker.go:494] system disk:vda1
I0320 23:24:14.456643 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:24:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:24:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:24:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:24:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:24:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:24:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:23.409776 543705 cpu.go:275] no items to output this cycle
I0320 23:24:23.409776 543705 memory.go:184] no items to output this cycle
I0320 23:24:26.628286 543705 disk_info.go:125] begin check local disk info of client
I0320 23:24:26.630772 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:24:26.630778 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab8c0 0xc0001ab900]
E0320 23:24:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:33.409780 543705 memory.go:184] no items to output this cycle
I0320 23:24:33.409783 543705 cpu.go:275] no items to output this cycle
I0320 23:24:38.659003 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:24:38.659010 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:24:43.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:43.410540 543705 memory.go:191] Add success.
I0320 23:24:43.409806 543705 cpu.go:282] Add success.
I0320 23:24:43.420291 543705 net.go:648] Add success.
I0320 23:24:43.422808 543705 net.go:770] primary dev: ETH0
I0320 23:24:43.422823 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:24:43.422836 543705 net.go:698] Add success.
I0320 23:24:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:24:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:24:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:24:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:24:53.409793 543705 memory.go:184] no items to output this cycle
I0320 23:24:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 23:25:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:03.409779 543705 memory.go:184] no items to output this cycle
I0320 23:25:03.409780 543705 cpu.go:275] no items to output this cycle
W0320 23:25:13.409709 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:25:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:25:13.409733 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0320 23:25:13.409811 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:13.409827 543705 memory.go:191] Add success.
I0320 23:25:13.409826 543705 cpu.go:282] Add success.
I0320 23:25:13.420073 543705 net.go:648] Add success.
I0320 23:25:13.423271 543705 net.go:770] primary dev: ETH0
I0320 23:25:13.423285 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:25:13.423297 543705 net.go:698] Add success.
I0320 23:25:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:25:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:25:14.455163 543705 disk_worker.go:708] disk space is not compliant
W0320 23:25:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:25:14.456485 543705 disk_worker.go:494] system disk:vda1
I0320 23:25:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:25:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:25:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:25:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:25:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:25:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:25:23.410624 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:23.410639 543705 memory.go:184] no items to output this cycle
I0320 23:25:23.410642 543705 cpu.go:275] no items to output this cycle
I0320 23:25:26.631382 543705 disk_info.go:125] begin check local disk info of client
I0320 23:25:26.633903 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:25:26.633909 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bae00 0xc0002bae40]
E0320 23:25:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:33.409895 543705 memory.go:184] no items to output this cycle
I0320 23:25:33.409931 543705 cpu.go:275] no items to output this cycle
E0320 23:25:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:43.409792 543705 memory.go:191] Add success.
I0320 23:25:43.409792 543705 cpu.go:282] Add success.
I0320 23:25:43.419926 543705 net.go:648] Add success.
I0320 23:25:43.422730 543705 net.go:770] primary dev: ETH0
I0320 23:25:43.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:25:43.422759 543705 net.go:698] Add success.
I0320 23:25:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:25:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:25:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:25:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:25:53.409782 543705 memory.go:184] no items to output this cycle
I0320 23:25:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 23:26:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:03.409788 543705 memory.go:184] no items to output this cycle
I0320 23:26:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 23:26:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:13.409835 543705 memory.go:191] Add success.
I0320 23:26:13.409836 543705 cpu.go:282] Add success.
W0320 23:26:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:26:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:26:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:26:13.420174 543705 net.go:648] Add success.
I0320 23:26:13.422876 543705 net.go:770] primary dev: ETH0
I0320 23:26:13.422891 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:26:13.422906 543705 net.go:698] Add success.
I0320 23:26:14.453932 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:26:14.455143 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:26:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0320 23:26:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:26:14.456577 543705 disk_worker.go:494] system disk:vda1
I0320 23:26:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:26:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:26:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:26:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:26:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:26:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:26:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:23.409797 543705 memory.go:184] no items to output this cycle
I0320 23:26:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 23:26:26.634730 543705 disk_info.go:125] begin check local disk info of client
I0320 23:26:26.637200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:26:26.637213 543705 disk_info.go:196] parse disk info done, disk is : [0xc000521f40 0xc000540000]
E0320 23:26:33.409899 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:33.409918 543705 memory.go:184] no items to output this cycle
I0320 23:26:33.410111 543705 cpu.go:275] no items to output this cycle
E0320 23:26:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:43.409801 543705 memory.go:191] Add success.
I0320 23:26:43.409827 543705 cpu.go:282] Add success.
I0320 23:26:43.420025 543705 net.go:648] Add success.
I0320 23:26:43.422558 543705 net.go:770] primary dev: ETH0
I0320 23:26:43.422572 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:26:43.422584 543705 net.go:698] Add success.
I0320 23:26:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:26:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:26:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:26:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:26:53.409779 543705 memory.go:184] no items to output this cycle
I0320 23:26:53.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:27:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:03.409774 543705 memory.go:184] no items to output this cycle
I0320 23:27:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:27:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:13.409793 543705 memory.go:191] Add success.
I0320 23:27:13.409821 543705 cpu.go:282] Add success.
W0320 23:27:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:27:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:27:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:27:13.420113 543705 net.go:648] Add success.
I0320 23:27:13.428927 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 23:27:13.429006 543705 net.go:770] primary dev: ETH0
I0320 23:27:13.429020 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:27:13.429035 543705 net.go:698] Add success.
I0320 23:27:13.453625 543705 event_worker.go:152] Polling the log file for events...
I0320 23:27:13.469040 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1c729e02-6a4f-45e8-8126-172fd0fcfd2d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:27:13.469071 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 23:27:14.455163 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:27:14.455254 543705 disk_worker.go:708] disk space is not compliant
W0320 23:27:14.455258 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:27:14.455860 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:27:14.455869 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:27:14.455874 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:27:14.456777 543705 disk_worker.go:494] system disk:vda1
I0320 23:27:14.456831 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:27:15.456843 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:27:15.456852 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:27:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:27:16.457999 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:27:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:27:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:27:16.472448 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:27:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:23.409784 543705 memory.go:184] no items to output this cycle
I0320 23:27:23.409785 543705 cpu.go:275] no items to output this cycle
I0320 23:27:26.637293 543705 disk_info.go:125] begin check local disk info of client
I0320 23:27:26.639804 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:27:26.639811 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004927c0 0xc000492800]
E0320 23:27:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:33.409802 543705 memory.go:184] no items to output this cycle
I0320 23:27:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 23:27:38.659152 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:27:38.659159 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:27:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:43.410667 543705 memory.go:191] Add success.
I0320 23:27:43.409818 543705 cpu.go:282] Add success.
I0320 23:27:43.420461 543705 net.go:648] Add success.
I0320 23:27:43.423485 543705 net.go:770] primary dev: ETH0
I0320 23:27:43.423500 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:27:43.423515 543705 net.go:698] Add success.
I0320 23:27:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:27:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:27:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:27:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:27:53.409797 543705 memory.go:184] no items to output this cycle
I0320 23:27:53.409807 543705 cpu.go:275] no items to output this cycle
E0320 23:28:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:03.409781 543705 memory.go:184] no items to output this cycle
I0320 23:28:03.409789 543705 cpu.go:275] no items to output this cycle
E0320 23:28:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:13.409795 543705 memory.go:191] Add success.
I0320 23:28:13.409798 543705 cpu.go:282] Add success.
W0320 23:28:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:28:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:28:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:28:13.420215 543705 net.go:648] Add success.
I0320 23:28:13.423375 543705 net.go:770] primary dev: ETH0
I0320 23:28:13.423389 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:28:13.423402 543705 net.go:698] Add success.
I0320 23:28:14.454991 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:28:14.455187 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:28:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0320 23:28:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:28:14.456551 543705 disk_worker.go:494] system disk:vda1
I0320 23:28:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:28:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:28:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:28:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:28:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:28:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:28:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:23.409780 543705 memory.go:184] no items to output this cycle
I0320 23:28:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 23:28:26.640351 543705 disk_info.go:125] begin check local disk info of client
I0320 23:28:26.642815 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:28:26.642822 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056c000 0xc00056c040]
E0320 23:28:33.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:33.409824 543705 memory.go:184] no items to output this cycle
I0320 23:28:33.409847 543705 cpu.go:275] no items to output this cycle
E0320 23:28:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:43.409791 543705 memory.go:191] Add success.
I0320 23:28:43.409818 543705 cpu.go:282] Add success.
I0320 23:28:43.419899 543705 net.go:648] Add success.
I0320 23:28:43.422447 543705 net.go:770] primary dev: ETH0
I0320 23:28:43.422461 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:28:43.422473 543705 net.go:698] Add success.
I0320 23:28:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:28:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:28:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:28:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:28:53.409784 543705 memory.go:184] no items to output this cycle
I0320 23:28:53.409788 543705 cpu.go:275] no items to output this cycle
E0320 23:29:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:03.409774 543705 memory.go:184] no items to output this cycle
I0320 23:29:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 23:29:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:13.409800 543705 memory.go:191] Add success.
I0320 23:29:13.409805 543705 cpu.go:282] Add success.
W0320 23:29:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:29:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:29:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:29:13.420069 543705 net.go:648] Add success.
I0320 23:29:13.422861 543705 net.go:770] primary dev: ETH0
I0320 23:29:13.422877 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:29:13.422892 543705 net.go:698] Add success.
I0320 23:29:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:29:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:29:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0320 23:29:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:29:14.456584 543705 disk_worker.go:494] system disk:vda1
I0320 23:29:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:29:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:29:16.458012 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:29:16.458090 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:29:16.458120 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:29:16.472505 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:29:23.410298 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:23.410316 543705 memory.go:184] no items to output this cycle
I0320 23:29:23.410360 543705 cpu.go:275] no items to output this cycle
I0320 23:29:26.643432 543705 disk_info.go:125] begin check local disk info of client
I0320 23:29:26.645961 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:29:26.645969 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a000 0xc00007a700]
E0320 23:29:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:33.409786 543705 memory.go:184] no items to output this cycle
I0320 23:29:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 23:29:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:43.409787 543705 memory.go:191] Add success.
I0320 23:29:43.409789 543705 cpu.go:282] Add success.
I0320 23:29:43.419968 543705 net.go:648] Add success.
I0320 23:29:43.422694 543705 net.go:770] primary dev: ETH0
I0320 23:29:43.422707 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:29:43.422720 543705 net.go:698] Add success.
I0320 23:29:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:29:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:29:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:29:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:29:53.409803 543705 memory.go:184] no items to output this cycle
I0320 23:29:53.409815 543705 cpu.go:275] no items to output this cycle
E0320 23:30:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:03.409800 543705 memory.go:184] no items to output this cycle
I0320 23:30:03.409818 543705 cpu.go:275] no items to output this cycle
E0320 23:30:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:13.409790 543705 memory.go:191] Add success.
I0320 23:30:13.409810 543705 cpu.go:282] Add success.
W0320 23:30:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:30:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:30:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:30:13.420212 543705 net.go:648] Add success.
I0320 23:30:13.422673 543705 net.go:770] primary dev: ETH0
I0320 23:30:13.422686 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:30:13.422698 543705 net.go:698] Add success.
I0320 23:30:13.595763 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ad5b3b1b-c348-48dc-a8c5-cc0355a25c5a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:30:13.595798 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:30:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:30:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:30:14.455239 543705 disk_worker.go:708] disk space is not compliant
W0320 23:30:14.455242 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:30:14.456804 543705 disk_worker.go:494] system disk:vda1
I0320 23:30:14.456840 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:30:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:30:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:30:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:30:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:30:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:30:23.409912 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:23.409933 543705 memory.go:184] no items to output this cycle
I0320 23:30:23.409964 543705 cpu.go:275] no items to output this cycle
I0320 23:30:26.646732 543705 disk_info.go:125] begin check local disk info of client
I0320 23:30:26.649176 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:30:26.649192 543705 disk_info.go:196] parse disk info done, disk is : [0xc000273980 0xc0002739c0]
E0320 23:30:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:33.409805 543705 memory.go:184] no items to output this cycle
I0320 23:30:33.409826 543705 cpu.go:275] no items to output this cycle
I0320 23:30:38.660002 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:30:38.660009 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:30:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:43.410663 543705 memory.go:191] Add success.
I0320 23:30:43.409838 543705 cpu.go:282] Add success.
I0320 23:30:43.420386 543705 net.go:648] Add success.
I0320 23:30:43.423166 543705 net.go:770] primary dev: ETH0
I0320 23:30:43.423180 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:30:43.423193 543705 net.go:698] Add success.
I0320 23:30:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:30:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:30:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:30:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:30:53.409765 543705 memory.go:184] no items to output this cycle
I0320 23:30:53.409794 543705 cpu.go:275] no items to output this cycle
E0320 23:31:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:03.409800 543705 memory.go:184] no items to output this cycle
I0320 23:31:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:31:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:13.409782 543705 memory.go:191] Add success.
I0320 23:31:13.409806 543705 cpu.go:282] Add success.
W0320 23:31:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:31:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:31:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:31:13.420178 543705 net.go:648] Add success.
I0320 23:31:13.422883 543705 net.go:770] primary dev: ETH0
I0320 23:31:13.422895 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:31:13.422907 543705 net.go:698] Add success.
I0320 23:31:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:31:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:31:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 23:31:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:31:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 23:31:14.456641 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:31:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:31:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:31:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:31:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:31:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:31:23.409859 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:23.409878 543705 memory.go:184] no items to output this cycle
I0320 23:31:23.409978 543705 cpu.go:275] no items to output this cycle
I0320 23:31:26.649492 543705 disk_info.go:125] begin check local disk info of client
I0320 23:31:26.651968 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:31:26.651975 543705 disk_info.go:196] parse disk info done, disk is : [0xc000286000 0xc000286040]
E0320 23:31:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:33.409797 543705 memory.go:184] no items to output this cycle
I0320 23:31:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 23:31:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:43.409792 543705 memory.go:191] Add success.
I0320 23:31:43.409807 543705 cpu.go:282] Add success.
I0320 23:31:43.420077 543705 net.go:648] Add success.
I0320 23:31:43.422491 543705 net.go:770] primary dev: ETH0
I0320 23:31:43.422504 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:31:43.422517 543705 net.go:698] Add success.
I0320 23:31:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:31:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:31:46.458059 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:31:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:31:53.409764 543705 memory.go:184] no items to output this cycle
I0320 23:31:53.409785 543705 cpu.go:275] no items to output this cycle
E0320 23:32:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:03.409775 543705 memory.go:184] no items to output this cycle
I0320 23:32:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 23:32:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:13.409796 543705 memory.go:191] Add success.
I0320 23:32:13.409797 543705 cpu.go:282] Add success.
W0320 23:32:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:32:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:32:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:32:13.420167 543705 net.go:648] Add success.
I0320 23:32:13.423190 543705 net.go:770] primary dev: ETH0
I0320 23:32:13.423204 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:32:13.423217 543705 net.go:698] Add success.
W0320 23:32:14.455147 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:32:14.455217 543705 disk_worker.go:708] disk space is not compliant
W0320 23:32:14.455220 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:32:14.457153 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
I0320 23:32:14.457163 543705 disk_worker.go:494] system disk:vda1
E0320 23:32:14.457164 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:32:14.457171 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:32:14.457197 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:32:15.456844 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:32:15.456855 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:32:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:32:16.457956 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:32:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:32:16.458015 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:32:16.472368 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:32:23.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:23.409894 543705 cpu.go:275] no items to output this cycle
I0320 23:32:23.409896 543705 memory.go:184] no items to output this cycle
I0320 23:32:26.652465 543705 disk_info.go:125] begin check local disk info of client
I0320 23:32:26.654964 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:32:26.654979 543705 disk_info.go:196] parse disk info done, disk is : [0xc000394340 0xc000394380]
E0320 23:32:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:33.409796 543705 memory.go:184] no items to output this cycle
I0320 23:32:33.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:32:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:43.409782 543705 memory.go:191] Add success.
I0320 23:32:43.409803 543705 cpu.go:282] Add success.
I0320 23:32:43.419884 543705 net.go:648] Add success.
I0320 23:32:43.422458 543705 net.go:770] primary dev: ETH0
I0320 23:32:43.422473 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:32:43.422488 543705 net.go:698] Add success.
I0320 23:32:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:32:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:32:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:32:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:32:53.409766 543705 memory.go:184] no items to output this cycle
I0320 23:32:53.409800 543705 cpu.go:275] no items to output this cycle
E0320 23:33:03.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:03.409772 543705 memory.go:184] no items to output this cycle
I0320 23:33:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 23:33:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:13.409786 543705 memory.go:191] Add success.
I0320 23:33:13.409812 543705 cpu.go:282] Add success.
W0320 23:33:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:33:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:33:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:33:13.420146 543705 net.go:648] Add success.
I0320 23:33:13.422955 543705 net.go:770] primary dev: ETH0
I0320 23:33:13.422978 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:33:13.422993 543705 net.go:698] Add success.
I0320 23:33:13.463883 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"874b215e-630e-4832-ab5e-f5c89cc64cdd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:33:13.463918 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:33:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:33:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:33:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 23:33:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:33:14.456508 543705 disk_worker.go:494] system disk:vda1
I0320 23:33:14.456563 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:33:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:33:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:33:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:33:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:33:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:33:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:23.409810 543705 memory.go:184] no items to output this cycle
I0320 23:33:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 23:33:26.655528 543705 disk_info.go:125] begin check local disk info of client
I0320 23:33:26.658080 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:33:26.658087 543705 disk_info.go:196] parse disk info done, disk is : [0xc00028e000 0xc00028e040]
E0320 23:33:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:33.409797 543705 memory.go:184] no items to output this cycle
I0320 23:33:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 23:33:38.660148 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:33:38.660155 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:33:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:43.410641 543705 memory.go:191] Add success.
I0320 23:33:43.409824 543705 cpu.go:282] Add success.
I0320 23:33:43.420409 543705 net.go:648] Add success.
I0320 23:33:43.422873 543705 net.go:770] primary dev: ETH0
I0320 23:33:43.422886 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:33:43.422898 543705 net.go:698] Add success.
I0320 23:33:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:33:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:33:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:33:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:33:53.409802 543705 memory.go:184] no items to output this cycle
I0320 23:33:53.409816 543705 cpu.go:275] no items to output this cycle
E0320 23:34:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:03.409773 543705 memory.go:184] no items to output this cycle
I0320 23:34:03.409777 543705 cpu.go:275] no items to output this cycle
E0320 23:34:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:13.409797 543705 memory.go:191] Add success.
I0320 23:34:13.409803 543705 cpu.go:282] Add success.
W0320 23:34:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:34:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:34:13.409842 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:34:13.420047 543705 net.go:648] Add success.
I0320 23:34:13.422904 543705 net.go:770] primary dev: ETH0
I0320 23:34:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:34:13.422934 543705 net.go:698] Add success.
I0320 23:34:14.454981 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:34:14.455265 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:34:14.455282 543705 disk_worker.go:708] disk space is not compliant
W0320 23:34:14.455286 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:34:14.457075 543705 disk_worker.go:494] system disk:vda1
I0320 23:34:14.457108 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:34:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:34:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:34:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:34:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:34:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:34:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:23.409785 543705 memory.go:184] no items to output this cycle
I0320 23:34:23.409801 543705 cpu.go:275] no items to output this cycle
I0320 23:34:26.658736 543705 disk_info.go:125] begin check local disk info of client
I0320 23:34:26.661247 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:34:26.661256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003905c0 0xc000390600]
E0320 23:34:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:33.409799 543705 memory.go:184] no items to output this cycle
I0320 23:34:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 23:34:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:43.409785 543705 memory.go:191] Add success.
I0320 23:34:43.409787 543705 cpu.go:282] Add success.
I0320 23:34:43.419900 543705 net.go:648] Add success.
I0320 23:34:43.422339 543705 net.go:770] primary dev: ETH0
I0320 23:34:43.422354 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:34:43.422369 543705 net.go:698] Add success.
I0320 23:34:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:34:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:34:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:34:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:34:53.409795 543705 memory.go:184] no items to output this cycle
I0320 23:34:53.409803 543705 cpu.go:275] no items to output this cycle
E0320 23:35:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:03.409792 543705 memory.go:184] no items to output this cycle
I0320 23:35:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 23:35:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:13.409821 543705 memory.go:191] Add success.
I0320 23:35:13.409828 543705 cpu.go:282] Add success.
W0320 23:35:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:35:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:35:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:35:13.420150 543705 net.go:648] Add success.
I0320 23:35:13.422963 543705 net.go:770] primary dev: ETH0
I0320 23:35:13.422981 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:35:13.422996 543705 net.go:698] Add success.
I0320 23:35:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:35:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:35:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0320 23:35:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:35:14.456495 543705 disk_worker.go:494] system disk:vda1
I0320 23:35:14.456539 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:35:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:35:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:35:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:35:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:35:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:35:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:23.409809 543705 memory.go:184] no items to output this cycle
I0320 23:35:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 23:35:26.661342 543705 disk_info.go:125] begin check local disk info of client
I0320 23:35:26.663992 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:35:26.663999 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ffc00 0xc0003ffc40]
E0320 23:35:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:33.409800 543705 memory.go:184] no items to output this cycle
I0320 23:35:33.409819 543705 cpu.go:275] no items to output this cycle
E0320 23:35:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:43.409792 543705 memory.go:191] Add success.
I0320 23:35:43.409792 543705 cpu.go:282] Add success.
I0320 23:35:43.419974 543705 net.go:648] Add success.
I0320 23:35:43.422717 543705 net.go:770] primary dev: ETH0
I0320 23:35:43.422730 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:35:43.422742 543705 net.go:698] Add success.
I0320 23:35:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:35:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:35:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:35:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:35:53.409782 543705 cpu.go:275] no items to output this cycle
I0320 23:35:53.409791 543705 memory.go:184] no items to output this cycle
E0320 23:36:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:03.409782 543705 memory.go:184] no items to output this cycle
I0320 23:36:03.409783 543705 cpu.go:275] no items to output this cycle
E0320 23:36:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:13.409784 543705 memory.go:191] Add success.
I0320 23:36:13.409803 543705 cpu.go:282] Add success.
W0320 23:36:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:36:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:36:13.409829 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:36:13.420604 543705 net.go:648] Add success.
I0320 23:36:13.423364 543705 net.go:770] primary dev: ETH0
I0320 23:36:13.423378 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:36:13.423390 543705 net.go:698] Add success.
I0320 23:36:13.469294 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f3bf6ab8-b14d-4571-bdbf-d5dcdab31167","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:36:13.469338 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:36:14.453944 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:36:14.455287 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:36:14.455302 543705 disk_worker.go:708] disk space is not compliant
W0320 23:36:14.455306 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:36:14.456915 543705 disk_worker.go:494] system disk:vda1
I0320 23:36:14.456945 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:36:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:36:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:36:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:36:16.458154 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:36:16.472114 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:36:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:23.409791 543705 memory.go:184] no items to output this cycle
I0320 23:36:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 23:36:26.664571 543705 disk_info.go:125] begin check local disk info of client
I0320 23:36:26.667143 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:36:26.667150 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b500 0xc00007b540]
E0320 23:36:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:33.409766 543705 memory.go:184] no items to output this cycle
I0320 23:36:33.409793 543705 cpu.go:275] no items to output this cycle
I0320 23:36:38.660998 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:36:38.661005 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:36:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:43.410627 543705 memory.go:191] Add success.
I0320 23:36:43.409813 543705 cpu.go:282] Add success.
I0320 23:36:43.420326 543705 net.go:648] Add success.
I0320 23:36:43.422938 543705 net.go:770] primary dev: ETH0
I0320 23:36:43.422957 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:36:43.422972 543705 net.go:698] Add success.
I0320 23:36:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:36:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:36:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:36:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:36:53.409776 543705 memory.go:184] no items to output this cycle
I0320 23:36:53.409797 543705 cpu.go:275] no items to output this cycle
E0320 23:37:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:03.409808 543705 memory.go:184] no items to output this cycle
I0320 23:37:03.409820 543705 cpu.go:275] no items to output this cycle
E0320 23:37:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:13.409802 543705 memory.go:191] Add success.
I0320 23:37:13.409802 543705 cpu.go:282] Add success.
W0320 23:37:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:37:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:37:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:37:13.420162 543705 net.go:648] Add success.
I0320 23:37:13.423043 543705 net.go:770] primary dev: ETH0
I0320 23:37:13.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:37:13.423074 543705 net.go:698] Add success.
I0320 23:37:13.453613 543705 event_worker.go:152] Polling the log file for events...
W0320 23:37:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:37:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0320 23:37:14.455174 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:37:14.456942 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:37:14.456951 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:37:14.456957 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:37:14.457002 543705 disk_worker.go:494] system disk:vda1
I0320 23:37:14.457055 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:37:15.456805 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:37:15.456813 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:37:16.457958 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:37:16.457962 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:37:16.458012 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:37:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:37:16.472358 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:37:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:23.409793 543705 cpu.go:275] no items to output this cycle
I0320 23:37:23.409802 543705 memory.go:184] no items to output this cycle
I0320 23:37:26.667560 543705 disk_info.go:125] begin check local disk info of client
I0320 23:37:26.670066 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:37:26.670073 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa480 0xc0001fa4c0]
E0320 23:37:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:33.409777 543705 memory.go:184] no items to output this cycle
I0320 23:37:33.409794 543705 cpu.go:275] no items to output this cycle
E0320 23:37:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:43.409791 543705 memory.go:191] Add success.
I0320 23:37:43.409814 543705 cpu.go:282] Add success.
I0320 23:37:43.419850 543705 net.go:648] Add success.
I0320 23:37:43.422570 543705 net.go:770] primary dev: ETH0
I0320 23:37:43.422583 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:37:43.422595 543705 net.go:698] Add success.
I0320 23:37:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:37:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:37:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:37:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:37:53.409775 543705 memory.go:184] no items to output this cycle
I0320 23:37:53.409774 543705 cpu.go:275] no items to output this cycle
E0320 23:38:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:03.409797 543705 memory.go:184] no items to output this cycle
I0320 23:38:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:38:13.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:13.409835 543705 memory.go:191] Add success.
I0320 23:38:13.409844 543705 cpu.go:282] Add success.
W0320 23:38:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:38:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:38:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:38:13.420244 543705 net.go:648] Add success.
I0320 23:38:13.423067 543705 net.go:770] primary dev: ETH0
I0320 23:38:13.423082 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:38:13.423098 543705 net.go:698] Add success.
I0320 23:38:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:38:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:38:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0320 23:38:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:38:14.456866 543705 disk_worker.go:494] system disk:vda1
I0320 23:38:14.456912 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:38:15.456029 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:38:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:38:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:38:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:38:16.472441 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:38:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:23.409782 543705 memory.go:184] no items to output this cycle
I0320 23:38:23.409803 543705 cpu.go:275] no items to output this cycle
I0320 23:38:26.670728 543705 disk_info.go:125] begin check local disk info of client
I0320 23:38:26.673542 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:38:26.673549 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035f340 0xc00035f380]
E0320 23:38:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:33.409761 543705 memory.go:184] no items to output this cycle
I0320 23:38:33.409799 543705 cpu.go:275] no items to output this cycle
E0320 23:38:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:43.409794 543705 memory.go:191] Add success.
I0320 23:38:43.409813 543705 cpu.go:282] Add success.
I0320 23:38:43.419986 543705 net.go:648] Add success.
I0320 23:38:43.422615 543705 net.go:770] primary dev: ETH0
I0320 23:38:43.422637 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:38:43.422651 543705 net.go:698] Add success.
I0320 23:38:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:38:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:38:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:38:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:38:53.409792 543705 memory.go:184] no items to output this cycle
I0320 23:38:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 23:39:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:03.409778 543705 memory.go:184] no items to output this cycle
I0320 23:39:03.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:39:13.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:13.409818 543705 cpu.go:282] Add success.
I0320 23:39:13.409820 543705 memory.go:191] Add success.
W0320 23:39:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:39:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:39:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:39:13.420126 543705 net.go:648] Add success.
I0320 23:39:13.422815 543705 net.go:770] primary dev: ETH0
I0320 23:39:13.422827 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:39:13.422840 543705 net.go:698] Add success.
I0320 23:39:13.463791 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"995feba5-bcb8-43e3-aa87-54e13322ca3f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:39:13.463826 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:39:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:39:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:39:14.455303 543705 disk_worker.go:708] disk space is not compliant
W0320 23:39:14.455306 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:39:14.457381 543705 disk_worker.go:494] system disk:vda1
I0320 23:39:14.457408 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:39:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:39:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:39:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:39:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:39:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:39:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:23.409802 543705 memory.go:184] no items to output this cycle
I0320 23:39:23.409812 543705 cpu.go:275] no items to output this cycle
I0320 23:39:26.673676 543705 disk_info.go:125] begin check local disk info of client
I0320 23:39:26.676245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:39:26.676252 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e7cc0 0xc0003e7d00]
E0320 23:39:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:33.409812 543705 memory.go:184] no items to output this cycle
I0320 23:39:33.409828 543705 cpu.go:275] no items to output this cycle
I0320 23:39:38.661738 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:39:38.661745 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:39:43.409807 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:43.410609 543705 memory.go:191] Add success.
I0320 23:39:43.409846 543705 cpu.go:282] Add success.
I0320 23:39:43.420292 543705 net.go:648] Add success.
I0320 23:39:43.422967 543705 net.go:770] primary dev: ETH0
I0320 23:39:43.422980 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:39:43.422994 543705 net.go:698] Add success.
I0320 23:39:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:39:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:39:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:39:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:39:53.409779 543705 memory.go:184] no items to output this cycle
I0320 23:39:53.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:40:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:03.409765 543705 memory.go:184] no items to output this cycle
I0320 23:40:03.409804 543705 cpu.go:275] no items to output this cycle
E0320 23:40:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:13.409826 543705 memory.go:191] Add success.
I0320 23:40:13.409829 543705 cpu.go:282] Add success.
W0320 23:40:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:40:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:40:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:40:13.420149 543705 net.go:648] Add success.
I0320 23:40:13.423095 543705 net.go:770] primary dev: ETH0
I0320 23:40:13.423109 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:40:13.423121 543705 net.go:698] Add success.
W0320 23:40:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:40:14.455271 543705 disk_worker.go:708] disk space is not compliant
W0320 23:40:14.455275 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:40:14.455641 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:40:14.457548 543705 disk_worker.go:494] system disk:vda1
I0320 23:40:14.457589 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:40:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:40:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:40:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:40:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:40:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:40:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:23.409778 543705 memory.go:184] no items to output this cycle
I0320 23:40:23.409795 543705 cpu.go:275] no items to output this cycle
I0320 23:40:26.676334 543705 disk_info.go:125] begin check local disk info of client
I0320 23:40:26.678899 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:40:26.678914 543705 disk_info.go:196] parse disk info done, disk is : [0xc00029e100 0xc00029e140]
E0320 23:40:33.409813 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:33.409850 543705 memory.go:184] no items to output this cycle
I0320 23:40:33.409855 543705 cpu.go:275] no items to output this cycle
E0320 23:40:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:43.409783 543705 memory.go:191] Add success.
I0320 23:40:43.409787 543705 cpu.go:282] Add success.
I0320 23:40:43.419900 543705 net.go:648] Add success.
I0320 23:40:43.422641 543705 net.go:770] primary dev: ETH0
I0320 23:40:43.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:40:43.422672 543705 net.go:698] Add success.
I0320 23:40:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:40:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:40:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:40:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:40:53.409792 543705 memory.go:184] no items to output this cycle
I0320 23:40:53.409802 543705 cpu.go:275] no items to output this cycle
E0320 23:41:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:03.409764 543705 memory.go:184] no items to output this cycle
I0320 23:41:03.409801 543705 cpu.go:275] no items to output this cycle
E0320 23:41:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:13.409827 543705 memory.go:191] Add success.
I0320 23:41:13.409831 543705 cpu.go:282] Add success.
W0320 23:41:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:41:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:41:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:41:13.420248 543705 net.go:648] Add success.
I0320 23:41:13.422973 543705 net.go:770] primary dev: ETH0
I0320 23:41:13.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:41:13.422998 543705 net.go:698] Add success.
I0320 23:41:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:41:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:41:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0320 23:41:14.455199 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:41:14.456570 543705 disk_worker.go:494] system disk:vda1
I0320 23:41:14.456599 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:41:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:41:16.458001 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:41:16.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:41:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:41:16.472458 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:41:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:23.409783 543705 memory.go:184] no items to output this cycle
I0320 23:41:23.409787 543705 cpu.go:275] no items to output this cycle
I0320 23:41:26.679590 543705 disk_info.go:125] begin check local disk info of client
I0320 23:41:26.682442 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:41:26.682449 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e7480 0xc0003e74c0]
E0320 23:41:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:33.409776 543705 memory.go:184] no items to output this cycle
I0320 23:41:33.409781 543705 cpu.go:275] no items to output this cycle
E0320 23:41:43.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:43.409806 543705 memory.go:191] Add success.
I0320 23:41:43.409816 543705 cpu.go:282] Add success.
I0320 23:41:43.419840 543705 net.go:648] Add success.
I0320 23:41:43.422607 543705 net.go:770] primary dev: ETH0
I0320 23:41:43.422620 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:41:43.422633 543705 net.go:698] Add success.
I0320 23:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:41:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:41:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:41:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:41:53.409791 543705 memory.go:184] no items to output this cycle
I0320 23:41:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 23:42:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:03.409771 543705 memory.go:184] no items to output this cycle
I0320 23:42:03.409796 543705 cpu.go:275] no items to output this cycle
E0320 23:42:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:13.409816 543705 memory.go:191] Add success.
I0320 23:42:13.409826 543705 cpu.go:282] Add success.
W0320 23:42:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:42:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:42:13.409862 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:42:13.420166 543705 net.go:648] Add success.
I0320 23:42:13.422961 543705 net.go:770] primary dev: ETH0
I0320 23:42:13.422973 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:42:13.422991 543705 net.go:698] Add success.
I0320 23:42:13.469163 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f1cac995-f20b-490a-beb8-b3e052200a8d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:42:13.469194 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 23:42:14.455225 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:42:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0320 23:42:14.455240 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:42:14.456259 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:42:14.456269 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:42:14.456274 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:42:14.457205 543705 disk_worker.go:494] system disk:vda1
I0320 23:42:14.457230 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:42:15.457026 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:42:15.457048 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:42:16.457930 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:42:16.457933 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:42:16.457989 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:42:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:42:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:42:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:23.409802 543705 memory.go:184] no items to output this cycle
I0320 23:42:23.409813 543705 cpu.go:275] no items to output this cycle
I0320 23:42:26.683560 543705 disk_info.go:125] begin check local disk info of client
I0320 23:42:26.686071 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:42:26.686077 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492440 0xc000492480]
E0320 23:42:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:33.409805 543705 memory.go:184] no items to output this cycle
I0320 23:42:33.409824 543705 cpu.go:275] no items to output this cycle
I0320 23:42:38.663008 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:42:38.663015 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:42:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:43.410812 543705 memory.go:191] Add success.
I0320 23:42:43.409817 543705 cpu.go:282] Add success.
I0320 23:42:43.420495 543705 net.go:648] Add success.
I0320 23:42:43.423447 543705 net.go:770] primary dev: ETH0
I0320 23:42:43.423460 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:42:43.423473 543705 net.go:698] Add success.
I0320 23:42:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:42:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:42:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:42:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:42:53.409797 543705 memory.go:184] no items to output this cycle
I0320 23:42:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:43:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:03.409772 543705 memory.go:184] no items to output this cycle
I0320 23:43:03.409780 543705 cpu.go:275] no items to output this cycle
E0320 23:43:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:13.409793 543705 memory.go:191] Add success.
I0320 23:43:13.409793 543705 cpu.go:282] Add success.
W0320 23:43:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:43:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:43:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:43:13.420170 543705 net.go:648] Add success.
I0320 23:43:13.422941 543705 net.go:770] primary dev: ETH0
I0320 23:43:13.422958 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:43:13.422972 543705 net.go:698] Add success.
I0320 23:43:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:43:14.455394 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:43:14.455413 543705 disk_worker.go:708] disk space is not compliant
W0320 23:43:14.455418 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:43:14.457017 543705 disk_worker.go:494] system disk:vda1
I0320 23:43:14.457049 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:43:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:43:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:43:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:43:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:43:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:23.409807 543705 memory.go:184] no items to output this cycle
I0320 23:43:23.409817 543705 cpu.go:275] no items to output this cycle
I0320 23:43:26.686734 543705 disk_info.go:125] begin check local disk info of client
I0320 23:43:26.689249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:43:26.689257 543705 disk_info.go:196] parse disk info done, disk is : [0xc00048a740 0xc00048a780]
E0320 23:43:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:33.409776 543705 memory.go:184] no items to output this cycle
I0320 23:43:33.409788 543705 cpu.go:275] no items to output this cycle
E0320 23:43:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:43.409795 543705 memory.go:191] Add success.
I0320 23:43:43.409815 543705 cpu.go:282] Add success.
I0320 23:43:43.419955 543705 net.go:648] Add success.
I0320 23:43:43.423226 543705 net.go:770] primary dev: ETH0
I0320 23:43:43.423238 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:43:43.423251 543705 net.go:698] Add success.
I0320 23:43:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:43:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:43:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:43:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:43:53.409776 543705 memory.go:184] no items to output this cycle
I0320 23:43:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 23:44:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:03.409800 543705 memory.go:184] no items to output this cycle
I0320 23:44:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 23:44:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:13.409792 543705 memory.go:191] Add success.
I0320 23:44:13.409808 543705 cpu.go:282] Add success.
W0320 23:44:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:44:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:44:13.409835 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:44:13.420137 543705 net.go:648] Add success.
I0320 23:44:13.423180 543705 net.go:770] primary dev: ETH0
I0320 23:44:13.423195 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:44:13.423209 543705 net.go:698] Add success.
I0320 23:44:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:44:14.455331 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:44:14.455345 543705 disk_worker.go:708] disk space is not compliant
W0320 23:44:14.455349 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:44:14.457064 543705 disk_worker.go:494] system disk:vda1
I0320 23:44:14.457093 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:44:15.456018 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:44:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:44:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:44:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:44:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:44:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:23.409777 543705 memory.go:184] no items to output this cycle
I0320 23:44:23.409798 543705 cpu.go:275] no items to output this cycle
I0320 23:44:26.689652 543705 disk_info.go:125] begin check local disk info of client
I0320 23:44:26.692224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:44:26.692237 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e69c0 0xc0003e6a00]
E0320 23:44:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:33.409775 543705 memory.go:184] no items to output this cycle
I0320 23:44:33.409784 543705 cpu.go:275] no items to output this cycle
E0320 23:44:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:43.409807 543705 memory.go:191] Add success.
I0320 23:44:43.409815 543705 cpu.go:282] Add success.
I0320 23:44:43.419882 543705 net.go:648] Add success.
I0320 23:44:43.422515 543705 net.go:770] primary dev: ETH0
I0320 23:44:43.422531 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:44:43.422544 543705 net.go:698] Add success.
I0320 23:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:44:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:44:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:44:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:44:53.409777 543705 memory.go:184] no items to output this cycle
I0320 23:44:53.409779 543705 cpu.go:275] no items to output this cycle
E0320 23:45:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:03.409768 543705 memory.go:184] no items to output this cycle
I0320 23:45:03.409791 543705 cpu.go:275] no items to output this cycle
E0320 23:45:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:13.409813 543705 memory.go:191] Add success.
I0320 23:45:13.409824 543705 cpu.go:282] Add success.
W0320 23:45:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:45:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:45:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:45:13.420143 543705 net.go:648] Add success.
I0320 23:45:13.422764 543705 net.go:770] primary dev: ETH0
I0320 23:45:13.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:45:13.422790 543705 net.go:698] Add success.
I0320 23:45:13.470307 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2e5183aa-ba7d-4a3f-8b83-82f282333681","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:45:13.470343 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:45:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:45:14.455287 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:45:14.455375 543705 disk_worker.go:708] disk space is not compliant
W0320 23:45:14.455389 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:45:14.457212 543705 disk_worker.go:494] system disk:vda1
I0320 23:45:14.457243 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:45:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:45:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:45:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:45:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:45:16.472465 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:45:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 23:45:23.409795 543705 memory.go:184] no items to output this cycle
I0320 23:45:26.692724 543705 disk_info.go:125] begin check local disk info of client
I0320 23:45:26.695490 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:45:26.695499 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003318c0 0xc000331900]
E0320 23:45:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:33.409770 543705 memory.go:184] no items to output this cycle
I0320 23:45:33.409801 543705 cpu.go:275] no items to output this cycle
I0320 23:45:38.664016 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:45:38.664023 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:45:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:43.410731 543705 memory.go:191] Add success.
I0320 23:45:43.409823 543705 cpu.go:282] Add success.
I0320 23:45:43.420459 543705 net.go:648] Add success.
I0320 23:45:43.423274 543705 net.go:770] primary dev: ETH0
I0320 23:45:43.423288 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:45:43.423300 543705 net.go:698] Add success.
I0320 23:45:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:45:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:45:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:45:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:45:53.409793 543705 memory.go:184] no items to output this cycle
I0320 23:45:53.409805 543705 cpu.go:275] no items to output this cycle
E0320 23:46:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:03.409780 543705 memory.go:184] no items to output this cycle
I0320 23:46:03.409781 543705 cpu.go:275] no items to output this cycle
W0320 23:46:13.409710 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:46:13.409733 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:46:13.409739 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:46:13.409832 543705 cpu.go:282] Add success.
E0320 23:46:13.409836 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:13.409856 543705 memory.go:191] Add success.
I0320 23:46:13.420116 543705 net.go:648] Add success.
I0320 23:46:13.422685 543705 net.go:770] primary dev: ETH0
I0320 23:46:13.422698 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:46:13.422710 543705 net.go:698] Add success.
I0320 23:46:14.453958 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:46:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:46:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0320 23:46:14.455240 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:46:14.456618 543705 disk_worker.go:494] system disk:vda1
I0320 23:46:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:46:15.455983 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:46:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:46:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:46:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:46:16.472457 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:46:23.410348 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:23.410364 543705 memory.go:184] no items to output this cycle
I0320 23:46:23.410378 543705 cpu.go:275] no items to output this cycle
I0320 23:46:26.696706 543705 disk_info.go:125] begin check local disk info of client
I0320 23:46:26.699266 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:46:26.699273 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002ab200 0xc0002ab240]
E0320 23:46:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:33.409798 543705 memory.go:184] no items to output this cycle
I0320 23:46:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:46:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:43.409812 543705 memory.go:191] Add success.
I0320 23:46:43.409821 543705 cpu.go:282] Add success.
I0320 23:46:43.419826 543705 net.go:648] Add success.
I0320 23:46:43.422586 543705 net.go:770] primary dev: ETH0
I0320 23:46:43.422599 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:46:43.422612 543705 net.go:698] Add success.
I0320 23:46:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:46:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:46:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:46:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:46:53.409772 543705 memory.go:184] no items to output this cycle
I0320 23:46:53.409787 543705 cpu.go:275] no items to output this cycle
E0320 23:47:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:03.409787 543705 cpu.go:275] no items to output this cycle
I0320 23:47:03.409791 543705 memory.go:184] no items to output this cycle
E0320 23:47:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:13.409820 543705 memory.go:191] Add success.
I0320 23:47:13.409826 543705 cpu.go:282] Add success.
W0320 23:47:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:47:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:47:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:47:13.420267 543705 net.go:648] Add success.
I0320 23:47:13.423134 543705 net.go:770] primary dev: ETH0
I0320 23:47:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:47:13.423163 543705 net.go:698] Add success.
I0320 23:47:13.453664 543705 event_worker.go:152] Polling the log file for events...
W0320 23:47:14.455223 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:47:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0320 23:47:14.455241 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:47:14.456024 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:47:14.456034 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:47:14.456039 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:47:14.456856 543705 disk_worker.go:494] system disk:vda1
I0320 23:47:14.456884 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:47:15.456785 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:47:15.456794 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:47:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:47:16.457968 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:47:16.458021 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:47:16.458039 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:47:16.472366 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:47:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:23.409799 543705 memory.go:184] no items to output this cycle
I0320 23:47:23.409811 543705 cpu.go:275] no items to output this cycle
I0320 23:47:26.699740 543705 disk_info.go:125] begin check local disk info of client
I0320 23:47:26.702241 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:47:26.702247 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb200 0xc0001fb240]
E0320 23:47:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:33.409800 543705 memory.go:184] no items to output this cycle
I0320 23:47:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:47:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:43.409816 543705 memory.go:191] Add success.
I0320 23:47:43.409836 543705 cpu.go:282] Add success.
I0320 23:47:43.419930 543705 net.go:648] Add success.
I0320 23:47:43.422636 543705 net.go:770] primary dev: ETH0
I0320 23:47:43.422651 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:47:43.422665 543705 net.go:698] Add success.
I0320 23:47:46.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:47:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:47:46.458094 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:47:53.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:47:53.409768 543705 memory.go:184] no items to output this cycle
I0320 23:47:53.409804 543705 cpu.go:275] no items to output this cycle
E0320 23:48:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:03.409764 543705 memory.go:184] no items to output this cycle
I0320 23:48:03.409794 543705 cpu.go:275] no items to output this cycle
E0320 23:48:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:13.409820 543705 memory.go:191] Add success.
I0320 23:48:13.409821 543705 cpu.go:282] Add success.
W0320 23:48:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:48:13.409862 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:48:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:48:13.420128 543705 net.go:648] Add success.
I0320 23:48:13.422801 543705 net.go:770] primary dev: ETH0
I0320 23:48:13.422815 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:48:13.422828 543705 net.go:698] Add success.
I0320 23:48:13.468732 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"372f63ab-a9b5-45f4-9201-fc626e2129ac","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:48:13.468767 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:48:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:48:14.455285 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:48:14.455350 543705 disk_worker.go:708] disk space is not compliant
W0320 23:48:14.455355 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:48:14.456906 543705 disk_worker.go:494] system disk:vda1
I0320 23:48:14.456949 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:48:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:48:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:48:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:48:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:48:16.472429 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:48:23.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:23.409817 543705 memory.go:184] no items to output this cycle
I0320 23:48:23.409828 543705 cpu.go:275] no items to output this cycle
I0320 23:48:26.702741 543705 disk_info.go:125] begin check local disk info of client
I0320 23:48:26.705278 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:48:26.705284 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e67c0 0xc0003e6800]
E0320 23:48:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:33.409797 543705 memory.go:184] no items to output this cycle
I0320 23:48:33.409811 543705 cpu.go:275] no items to output this cycle
I0320 23:48:38.665023 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:48:38.665029 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:48:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:43.410621 543705 memory.go:191] Add success.
I0320 23:48:43.409793 543705 cpu.go:282] Add success.
I0320 23:48:43.420341 543705 net.go:648] Add success.
I0320 23:48:43.422849 543705 net.go:770] primary dev: ETH0
I0320 23:48:43.422865 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:48:43.422879 543705 net.go:698] Add success.
I0320 23:48:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:48:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:48:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:48:53.410369 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:48:53.410390 543705 memory.go:184] no items to output this cycle
I0320 23:48:53.410400 543705 cpu.go:275] no items to output this cycle
E0320 23:49:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:03.409790 543705 memory.go:184] no items to output this cycle
I0320 23:49:03.409814 543705 cpu.go:275] no items to output this cycle
E0320 23:49:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:13.409822 543705 memory.go:191] Add success.
I0320 23:49:13.409822 543705 cpu.go:282] Add success.
W0320 23:49:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:49:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:49:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:49:13.420195 543705 net.go:648] Add success.
I0320 23:49:13.423089 543705 net.go:770] primary dev: ETH0
I0320 23:49:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:49:13.423116 543705 net.go:698] Add success.
I0320 23:49:14.454944 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:49:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:49:14.455147 543705 disk_worker.go:708] disk space is not compliant
W0320 23:49:14.455150 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:49:14.456466 543705 disk_worker.go:494] system disk:vda1
I0320 23:49:14.456508 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:49:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:49:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:49:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:49:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:49:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:49:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:23.409812 543705 memory.go:184] no items to output this cycle
I0320 23:49:23.409821 543705 cpu.go:275] no items to output this cycle
I0320 23:49:26.705671 543705 disk_info.go:125] begin check local disk info of client
I0320 23:49:26.708161 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:49:26.708167 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fd440 0xc0001fd480]
E0320 23:49:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:33.409800 543705 memory.go:184] no items to output this cycle
I0320 23:49:33.409813 543705 cpu.go:275] no items to output this cycle
E0320 23:49:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:43.409782 543705 memory.go:191] Add success.
I0320 23:49:43.409823 543705 cpu.go:282] Add success.
I0320 23:49:43.420239 543705 net.go:648] Add success.
I0320 23:49:43.423015 543705 net.go:770] primary dev: ETH0
I0320 23:49:43.423027 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:49:43.423042 543705 net.go:698] Add success.
I0320 23:49:46.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:49:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:49:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:49:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:49:53.409766 543705 memory.go:184] no items to output this cycle
I0320 23:49:53.409795 543705 cpu.go:275] no items to output this cycle
E0320 23:50:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:03.409793 543705 memory.go:184] no items to output this cycle
I0320 23:50:03.409805 543705 cpu.go:275] no items to output this cycle
E0320 23:50:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:13.409796 543705 memory.go:191] Add success.
I0320 23:50:13.409816 543705 cpu.go:282] Add success.
W0320 23:50:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:50:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:50:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:50:13.420174 543705 net.go:648] Add success.
I0320 23:50:13.422823 543705 net.go:770] primary dev: ETH0
I0320 23:50:13.422836 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:50:13.422848 543705 net.go:698] Add success.
I0320 23:50:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:50:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:50:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0320 23:50:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:50:14.456608 543705 disk_worker.go:494] system disk:vda1
I0320 23:50:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:50:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:50:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:50:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:50:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:50:16.472440 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:50:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:23.409769 543705 memory.go:184] no items to output this cycle
I0320 23:50:23.409800 543705 cpu.go:275] no items to output this cycle
I0320 23:50:26.708760 543705 disk_info.go:125] begin check local disk info of client
I0320 23:50:26.711301 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:50:26.711317 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e6140 0xc0003e6180]
E0320 23:50:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:33.409788 543705 memory.go:184] no items to output this cycle
I0320 23:50:33.409804 543705 cpu.go:275] no items to output this cycle
E0320 23:50:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:43.409812 543705 memory.go:191] Add success.
I0320 23:50:43.409819 543705 cpu.go:282] Add success.
I0320 23:50:43.420043 543705 net.go:648] Add success.
I0320 23:50:43.422719 543705 net.go:770] primary dev: ETH0
I0320 23:50:43.422732 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:50:43.422745 543705 net.go:698] Add success.
I0320 23:50:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:50:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:50:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:50:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:50:53.409796 543705 memory.go:184] no items to output this cycle
I0320 23:50:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:51:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:03.409773 543705 memory.go:184] no items to output this cycle
I0320 23:51:03.409775 543705 cpu.go:275] no items to output this cycle
E0320 23:51:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:13.409817 543705 memory.go:191] Add success.
I0320 23:51:13.409828 543705 cpu.go:282] Add success.
W0320 23:51:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:51:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:51:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:51:13.420103 543705 net.go:648] Add success.
I0320 23:51:13.423099 543705 net.go:770] primary dev: ETH0
I0320 23:51:13.423114 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:51:13.423126 543705 net.go:698] Add success.
I0320 23:51:13.470701 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4d1e14ba-7a0f-4f1c-aa35-2c2139565c2f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:51:13.470744 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:51:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:51:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:51:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0320 23:51:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:51:14.456693 543705 disk_worker.go:494] system disk:vda1
I0320 23:51:14.456722 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:51:15.455614 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:51:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:51:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:51:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:51:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:51:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:23.409779 543705 memory.go:184] no items to output this cycle
I0320 23:51:23.409808 543705 cpu.go:275] no items to output this cycle
I0320 23:51:26.711806 543705 disk_info.go:125] begin check local disk info of client
I0320 23:51:26.714414 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:51:26.714421 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003853c0 0xc000385400]
E0320 23:51:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:33.409768 543705 memory.go:184] no items to output this cycle
I0320 23:51:33.409792 543705 cpu.go:275] no items to output this cycle
I0320 23:51:38.665735 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:51:38.665742 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:51:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:43.410589 543705 memory.go:191] Add success.
I0320 23:51:43.409812 543705 cpu.go:282] Add success.
I0320 23:51:43.420279 543705 net.go:648] Add success.
I0320 23:51:43.423139 543705 net.go:770] primary dev: ETH0
I0320 23:51:43.423152 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:51:43.423165 543705 net.go:698] Add success.
I0320 23:51:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:51:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:51:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:51:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:51:53.409773 543705 memory.go:184] no items to output this cycle
I0320 23:51:53.409791 543705 cpu.go:275] no items to output this cycle
E0320 23:52:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:03.409768 543705 memory.go:184] no items to output this cycle
I0320 23:52:03.409793 543705 cpu.go:275] no items to output this cycle
E0320 23:52:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:13.409786 543705 memory.go:191] Add success.
I0320 23:52:13.409803 543705 cpu.go:282] Add success.
W0320 23:52:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:52:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:52:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:52:13.420091 543705 net.go:648] Add success.
I0320 23:52:13.422791 543705 net.go:770] primary dev: ETH0
I0320 23:52:13.422805 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:52:13.422820 543705 net.go:698] Add success.
W0320 23:52:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:52:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0320 23:52:14.455163 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:52:14.456615 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:52:14.456624 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:52:14.456630 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:52:14.457061 543705 disk_worker.go:494] system disk:vda1
I0320 23:52:14.457090 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:52:15.456979 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:52:15.456994 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:52:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:52:16.457976 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:52:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:52:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:52:16.472411 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:52:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:23.409805 543705 memory.go:184] no items to output this cycle
I0320 23:52:23.409816 543705 cpu.go:275] no items to output this cycle
I0320 23:52:26.714744 543705 disk_info.go:125] begin check local disk info of client
I0320 23:52:26.717253 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:52:26.717259 543705 disk_info.go:196] parse disk info done, disk is : [0xc000464540 0xc000464580]
E0320 23:52:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:33.409804 543705 memory.go:184] no items to output this cycle
I0320 23:52:33.409818 543705 cpu.go:275] no items to output this cycle
E0320 23:52:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:43.409804 543705 memory.go:191] Add success.
I0320 23:52:43.409811 543705 cpu.go:282] Add success.
I0320 23:52:43.419996 543705 net.go:648] Add success.
I0320 23:52:43.422770 543705 net.go:770] primary dev: ETH0
I0320 23:52:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:52:43.422797 543705 net.go:698] Add success.
I0320 23:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:52:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:52:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:52:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:52:53.409773 543705 memory.go:184] no items to output this cycle
I0320 23:52:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 23:53:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:03.409800 543705 memory.go:184] no items to output this cycle
I0320 23:53:03.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:53:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:13.409827 543705 memory.go:191] Add success.
I0320 23:53:13.409829 543705 cpu.go:282] Add success.
W0320 23:53:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:53:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:53:13.409873 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:53:13.420170 543705 net.go:648] Add success.
I0320 23:53:13.422952 543705 net.go:770] primary dev: ETH0
I0320 23:53:13.422966 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:53:13.422978 543705 net.go:698] Add success.
I0320 23:53:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:53:14.455370 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:53:14.455387 543705 disk_worker.go:708] disk space is not compliant
W0320 23:53:14.455390 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:53:14.456993 543705 disk_worker.go:494] system disk:vda1
I0320 23:53:14.457022 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:53:15.455950 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:53:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:53:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:53:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:53:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:53:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:23.409788 543705 memory.go:184] no items to output this cycle
I0320 23:53:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 23:53:26.717679 543705 disk_info.go:125] begin check local disk info of client
I0320 23:53:26.720192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:53:26.720198 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007be00 0xc00007be40]
E0320 23:53:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:33.409803 543705 memory.go:184] no items to output this cycle
I0320 23:53:33.409815 543705 cpu.go:275] no items to output this cycle
E0320 23:53:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:43.409820 543705 memory.go:191] Add success.
I0320 23:53:43.409829 543705 cpu.go:282] Add success.
I0320 23:53:43.419971 543705 net.go:648] Add success.
I0320 23:53:43.422658 543705 net.go:770] primary dev: ETH0
I0320 23:53:43.422671 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:53:43.422683 543705 net.go:698] Add success.
I0320 23:53:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:53:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:53:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:53:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:53:53.409781 543705 memory.go:184] no items to output this cycle
I0320 23:53:53.409799 543705 cpu.go:275] no items to output this cycle
E0320 23:54:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:03.409783 543705 memory.go:184] no items to output this cycle
I0320 23:54:03.409817 543705 cpu.go:275] no items to output this cycle
E0320 23:54:13.409801 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:13.409837 543705 memory.go:191] Add success.
I0320 23:54:13.409853 543705 cpu.go:282] Add success.
W0320 23:54:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:54:13.409886 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:54:13.409891 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:54:13.420208 543705 net.go:648] Add success.
I0320 23:54:13.422868 543705 net.go:770] primary dev: ETH0
I0320 23:54:13.422879 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:54:13.422902 543705 net.go:698] Add success.
I0320 23:54:13.468828 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a6f706eb-63a8-4abc-9d5f-28f0d5c8f296","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:54:13.468860 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0320 23:54:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:54:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:54:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0320 23:54:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:54:14.456624 543705 disk_worker.go:494] system disk:vda1
I0320 23:54:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:54:15.455988 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:54:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:54:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:54:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:54:16.472429 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:54:23.410419 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:23.410435 543705 memory.go:184] no items to output this cycle
I0320 23:54:23.410453 543705 cpu.go:275] no items to output this cycle
I0320 23:54:26.720758 543705 disk_info.go:125] begin check local disk info of client
I0320 23:54:26.723306 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:54:26.723312 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004716c0 0xc000471700]
E0320 23:54:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:33.409774 543705 memory.go:184] no items to output this cycle
I0320 23:54:33.409799 543705 cpu.go:275] no items to output this cycle
I0320 23:54:38.667031 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:54:38.667037 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:54:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:43.410659 543705 memory.go:191] Add success.
I0320 23:54:43.409833 543705 cpu.go:282] Add success.
I0320 23:54:43.420419 543705 net.go:648] Add success.
I0320 23:54:43.422954 543705 net.go:770] primary dev: ETH0
I0320 23:54:43.422967 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:54:43.422979 543705 net.go:698] Add success.
I0320 23:54:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:54:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:54:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:54:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:54:53.409775 543705 memory.go:184] no items to output this cycle
I0320 23:54:53.409778 543705 cpu.go:275] no items to output this cycle
E0320 23:55:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:03.409783 543705 memory.go:184] no items to output this cycle
I0320 23:55:03.409785 543705 cpu.go:275] no items to output this cycle
E0320 23:55:13.409887 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:13.409916 543705 memory.go:191] Add success.
I0320 23:55:13.409922 543705 cpu.go:282] Add success.
W0320 23:55:13.409951 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:55:13.409974 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:55:13.409979 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:55:13.419729 543705 net.go:648] Add success.
I0320 23:55:13.422533 543705 net.go:770] primary dev: ETH0
I0320 23:55:13.422547 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:55:13.422561 543705 net.go:698] Add success.
I0320 23:55:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:55:14.455145 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:55:14.455157 543705 disk_worker.go:708] disk space is not compliant
W0320 23:55:14.455159 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:55:14.456503 543705 disk_worker.go:494] system disk:vda1
I0320 23:55:14.456546 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:55:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:55:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:55:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:55:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:55:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:23.409784 543705 memory.go:184] no items to output this cycle
I0320 23:55:23.409802 543705 cpu.go:275] no items to output this cycle
I0320 23:55:26.723770 543705 disk_info.go:125] begin check local disk info of client
I0320 23:55:26.726304 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:55:26.726310 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fac00 0xc0001fac40]
E0320 23:55:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:33.409792 543705 memory.go:184] no items to output this cycle
I0320 23:55:33.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:55:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:43.409774 543705 memory.go:191] Add success.
I0320 23:55:43.409806 543705 cpu.go:282] Add success.
I0320 23:55:43.419881 543705 net.go:648] Add success.
I0320 23:55:43.422941 543705 net.go:770] primary dev: ETH0
I0320 23:55:43.422956 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:55:43.422971 543705 net.go:698] Add success.
I0320 23:55:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:55:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:55:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:55:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:55:53.409775 543705 memory.go:184] no items to output this cycle
I0320 23:55:53.409793 543705 cpu.go:275] no items to output this cycle
E0320 23:56:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:03.409800 543705 memory.go:184] no items to output this cycle
I0320 23:56:03.409812 543705 cpu.go:275] no items to output this cycle
E0320 23:56:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:13.409804 543705 memory.go:191] Add success.
I0320 23:56:13.409824 543705 cpu.go:282] Add success.
W0320 23:56:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:56:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:56:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:56:13.420191 543705 net.go:648] Add success.
I0320 23:56:13.423125 543705 net.go:770] primary dev: ETH0
I0320 23:56:13.423138 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:56:13.423151 543705 net.go:698] Add success.
I0320 23:56:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:56:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:56:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0320 23:56:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:56:14.456518 543705 disk_worker.go:494] system disk:vda1
I0320 23:56:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:56:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:56:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:56:16.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:56:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:56:16.472473 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:56:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:23.409777 543705 memory.go:184] no items to output this cycle
I0320 23:56:23.409799 543705 cpu.go:275] no items to output this cycle
I0320 23:56:26.726729 543705 disk_info.go:125] begin check local disk info of client
I0320 23:56:26.729285 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:56:26.729292 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bcc0 0xc00007bd00]
E0320 23:56:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:33.409801 543705 memory.go:184] no items to output this cycle
I0320 23:56:33.409810 543705 cpu.go:275] no items to output this cycle
E0320 23:56:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:43.409783 543705 memory.go:191] Add success.
I0320 23:56:43.409802 543705 cpu.go:282] Add success.
I0320 23:56:43.420067 543705 net.go:648] Add success.
I0320 23:56:43.422726 543705 net.go:770] primary dev: ETH0
I0320 23:56:43.422741 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:56:43.422756 543705 net.go:698] Add success.
I0320 23:56:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:56:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:56:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:56:53.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:56:53.409783 543705 cpu.go:275] no items to output this cycle
I0320 23:56:53.409787 543705 memory.go:184] no items to output this cycle
E0320 23:57:03.409857 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:03.409876 543705 memory.go:184] no items to output this cycle
I0320 23:57:03.409941 543705 cpu.go:275] no items to output this cycle
E0320 23:57:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:13.409791 543705 memory.go:191] Add success.
I0320 23:57:13.409816 543705 cpu.go:282] Add success.
W0320 23:57:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:57:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:57:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:57:13.420145 543705 net.go:648] Add success.
I0320 23:57:13.422772 543705 net.go:770] primary dev: ETH0
I0320 23:57:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:57:13.422797 543705 net.go:698] Add success.
I0320 23:57:13.429373 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0320 23:57:13.453557 543705 event_worker.go:152] Polling the log file for events...
I0320 23:57:13.469760 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0cf02da0-1cc0-4a37-b8cc-78ed6255f275","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0320 23:57:13.469793 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0320 23:57:14.455134 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:57:14.455193 543705 disk_worker.go:708] disk space is not compliant
W0320 23:57:14.455196 543705 disk_worker.go:728] disk inode is not compliant
E0320 23:57:14.455877 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0320 23:57:14.455885 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0320 23:57:14.455891 543705 custom_config.go:64] query custom config with name: gpu
I0320 23:57:14.456536 543705 disk_worker.go:494] system disk:vda1
I0320 23:57:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
E0320 23:57:15.456860 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0320 23:57:15.456869 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:57:16.457916 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0320 23:57:16.457927 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0320 23:57:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:57:16.457985 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:57:16.472293 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:57:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:23.409789 543705 memory.go:184] no items to output this cycle
I0320 23:57:23.409790 543705 cpu.go:275] no items to output this cycle
I0320 23:57:26.729674 543705 disk_info.go:125] begin check local disk info of client
I0320 23:57:26.732182 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:57:26.732188 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304d80 0xc000304dc0]
E0320 23:57:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:33.409803 543705 memory.go:184] no items to output this cycle
I0320 23:57:33.409819 543705 cpu.go:275] no items to output this cycle
I0320 23:57:38.667182 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0320 23:57:38.667188 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0320 23:57:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:43.410690 543705 memory.go:191] Add success.
I0320 23:57:43.409816 543705 cpu.go:282] Add success.
I0320 23:57:43.420377 543705 net.go:648] Add success.
I0320 23:57:43.422950 543705 net.go:770] primary dev: ETH0
I0320 23:57:43.422965 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:57:43.422978 543705 net.go:698] Add success.
I0320 23:57:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:57:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:57:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:57:53.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:57:53.409895 543705 memory.go:184] no items to output this cycle
I0320 23:57:53.409964 543705 cpu.go:275] no items to output this cycle
E0320 23:58:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:03.409767 543705 memory.go:184] no items to output this cycle
I0320 23:58:03.409808 543705 cpu.go:275] no items to output this cycle
E0320 23:58:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:13.409791 543705 memory.go:191] Add success.
W0320 23:58:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:58:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:58:13.409830 543705 cpu.go:282] Add success.
I0320 23:58:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:58:13.420039 543705 net.go:648] Add success.
I0320 23:58:13.422740 543705 net.go:770] primary dev: ETH0
I0320 23:58:13.422754 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:58:13.422766 543705 net.go:698] Add success.
I0320 23:58:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:58:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:58:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0320 23:58:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:58:14.456617 543705 disk_worker.go:494] system disk:vda1
I0320 23:58:14.456648 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:58:15.455974 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:58:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:58:16.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:58:16.458095 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:58:16.472541 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:58:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:23.409781 543705 memory.go:184] no items to output this cycle
I0320 23:58:23.409781 543705 cpu.go:275] no items to output this cycle
I0320 23:58:26.732891 543705 disk_info.go:125] begin check local disk info of client
I0320 23:58:26.735498 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:58:26.735505 543705 disk_info.go:196] parse disk info done, disk is : [0xc000304f00 0xc000304f40]
E0320 23:58:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:33.409793 543705 memory.go:184] no items to output this cycle
I0320 23:58:33.409812 543705 cpu.go:275] no items to output this cycle
E0320 23:58:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:43.409822 543705 memory.go:191] Add success.
I0320 23:58:43.409827 543705 cpu.go:282] Add success.
I0320 23:58:43.419965 543705 net.go:648] Add success.
I0320 23:58:43.423128 543705 net.go:770] primary dev: ETH0
I0320 23:58:43.423141 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:58:43.423154 543705 net.go:698] Add success.
I0320 23:58:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:58:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:58:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:58:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:58:53.409796 543705 memory.go:184] no items to output this cycle
I0320 23:58:53.409806 543705 cpu.go:275] no items to output this cycle
E0320 23:59:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:03.409790 543705 memory.go:184] no items to output this cycle
I0320 23:59:03.409797 543705 cpu.go:275] no items to output this cycle
W0320 23:59:13.409711 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0320 23:59:13.409729 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0320 23:59:13.409733 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0320 23:59:13.409801 543705 cpu.go:282] Add success.
E0320 23:59:13.409831 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:13.409853 543705 memory.go:191] Add success.
I0320 23:59:13.420106 543705 net.go:648] Add success.
I0320 23:59:13.423083 543705 net.go:770] primary dev: ETH0
I0320 23:59:13.423098 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:59:13.423113 543705 net.go:698] Add success.
I0320 23:59:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0320 23:59:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0320 23:59:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0320 23:59:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0320 23:59:14.456611 543705 disk_worker.go:494] system disk:vda1
I0320 23:59:14.456640 543705 disk_worker.go:432] add disk info successfully, len:33
I0320 23:59:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0320 23:59:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:59:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:59:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0320 23:59:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0320 23:59:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:23.409787 543705 memory.go:184] no items to output this cycle
I0320 23:59:23.409788 543705 cpu.go:275] no items to output this cycle
I0320 23:59:26.736910 543705 disk_info.go:125] begin check local disk info of client
I0320 23:59:26.739411 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0320 23:59:26.739418 543705 disk_info.go:196] parse disk info done, disk is : [0xc000540f40 0xc000540f80]
E0320 23:59:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:33.409790 543705 memory.go:184] no items to output this cycle
I0320 23:59:33.409802 543705 cpu.go:275] no items to output this cycle
E0320 23:59:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:43.409778 543705 memory.go:191] Add success.
I0320 23:59:43.409806 543705 cpu.go:282] Add success.
I0320 23:59:43.419892 543705 net.go:648] Add success.
I0320 23:59:43.422383 543705 net.go:770] primary dev: ETH0
I0320 23:59:43.422411 543705 net.go:802] Send network stats successfully!,count is 6
I0320 23:59:43.422425 543705 net.go:698] Add success.
I0320 23:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0320 23:59:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0320 23:59:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0320 23:59:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0320 23:59:53.409879 543705 cpu.go:275] no items to output this cycle
I0320 23:59:53.409892 543705 memory.go:184] no items to output this cycle
E0321 00:00:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:03.409785 543705 memory.go:184] no items to output this cycle
I0321 00:00:03.409800 543705 cpu.go:275] no items to output this cycle
E0321 00:00:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:13.409787 543705 memory.go:191] Add success.
I0321 00:00:13.409810 543705 cpu.go:282] Add success.
W0321 00:00:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:00:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:00:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:00:13.420282 543705 net.go:648] Add success.
I0321 00:00:13.423449 543705 net.go:770] primary dev: ETH0
I0321 00:00:13.423473 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:00:13.423485 543705 net.go:698] Add success.
I0321 00:00:13.464284 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fda9fe65-c9dc-4c83-924c-a6bd0ac37f99","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:00:13.464317 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:00:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:00:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:00:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0321 00:00:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:00:14.456497 543705 disk_worker.go:494] system disk:vda1
I0321 00:00:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:00:15.455639 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:00:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:00:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:00:16.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:00:16.472439 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:00:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:00:23.409785 543705 cpu.go:275] no items to output this cycle
I0321 00:00:26.739899 543705 disk_info.go:125] begin check local disk info of client
I0321 00:00:26.742447 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:00:26.742454 543705 disk_info.go:196] parse disk info done, disk is : [0xc000331d40 0xc000331d80]
E0321 00:00:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:33.409802 543705 memory.go:184] no items to output this cycle
I0321 00:00:33.409810 543705 cpu.go:275] no items to output this cycle
I0321 00:00:38.667330 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:00:38.667337 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:00:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:43.410803 543705 memory.go:191] Add success.
I0321 00:00:43.409795 543705 cpu.go:282] Add success.
I0321 00:00:43.420503 543705 net.go:648] Add success.
I0321 00:00:43.423619 543705 net.go:770] primary dev: ETH0
I0321 00:00:43.423631 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:00:43.423645 543705 net.go:698] Add success.
I0321 00:00:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:00:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:00:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:00:53.409863 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:00:53.409882 543705 memory.go:184] no items to output this cycle
I0321 00:00:53.409959 543705 cpu.go:275] no items to output this cycle
E0321 00:01:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:03.409793 543705 memory.go:184] no items to output this cycle
I0321 00:01:03.409802 543705 cpu.go:275] no items to output this cycle
E0321 00:01:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:13.409799 543705 memory.go:191] Add success.
I0321 00:01:13.409801 543705 cpu.go:282] Add success.
W0321 00:01:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:01:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:01:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:01:13.420173 543705 net.go:648] Add success.
I0321 00:01:13.422800 543705 net.go:770] primary dev: ETH0
I0321 00:01:13.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:01:13.422828 543705 net.go:698] Add success.
I0321 00:01:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:01:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:01:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 00:01:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:01:14.456593 543705 disk_worker.go:494] system disk:vda1
I0321 00:01:14.456624 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:01:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:01:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:01:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:01:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:01:16.472443 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:01:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:23.409782 543705 memory.go:184] no items to output this cycle
I0321 00:01:23.409794 543705 cpu.go:275] no items to output this cycle
I0321 00:01:26.742731 543705 disk_info.go:125] begin check local disk info of client
I0321 00:01:26.745195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:01:26.745201 543705 disk_info.go:196] parse disk info done, disk is : [0xc000486740 0xc000486780]
E0321 00:01:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:33.409797 543705 memory.go:184] no items to output this cycle
I0321 00:01:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 00:01:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:43.409794 543705 memory.go:191] Add success.
I0321 00:01:43.409826 543705 cpu.go:282] Add success.
I0321 00:01:43.419885 543705 net.go:648] Add success.
I0321 00:01:43.422513 543705 net.go:770] primary dev: ETH0
I0321 00:01:43.422526 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:01:43.422540 543705 net.go:698] Add success.
I0321 00:01:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:01:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:01:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:01:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:01:53.409802 543705 memory.go:184] no items to output this cycle
I0321 00:01:53.409817 543705 cpu.go:275] no items to output this cycle
E0321 00:02:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:03.409794 543705 memory.go:184] no items to output this cycle
I0321 00:02:03.409801 543705 cpu.go:275] no items to output this cycle
E0321 00:02:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:13.409812 543705 memory.go:191] Add success.
I0321 00:02:13.409813 543705 cpu.go:282] Add success.
W0321 00:02:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:02:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:02:13.409854 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:02:13.420139 543705 net.go:648] Add success.
I0321 00:02:13.422936 543705 net.go:770] primary dev: ETH0
I0321 00:02:13.422951 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:02:13.422964 543705 net.go:698] Add success.
W0321 00:02:14.455106 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:02:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0321 00:02:14.455173 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:02:14.455877 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:02:14.455885 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:02:14.455891 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:02:14.456539 543705 disk_worker.go:494] system disk:vda1
I0321 00:02:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:02:15.456847 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:02:15.456856 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:02:16.458095 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:02:16.458166 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
E0321 00:02:16.458167 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:02:16.458187 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:02:16.472570 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:02:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:23.409784 543705 memory.go:184] no items to output this cycle
I0321 00:02:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 00:02:26.745674 543705 disk_info.go:125] begin check local disk info of client
I0321 00:02:26.748190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:02:26.748196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb980 0xc0001fb9c0]
E0321 00:02:33.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:33.409806 543705 memory.go:184] no items to output this cycle
I0321 00:02:33.409818 543705 cpu.go:275] no items to output this cycle
E0321 00:02:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:43.409809 543705 memory.go:191] Add success.
I0321 00:02:43.409824 543705 cpu.go:282] Add success.
I0321 00:02:43.420061 543705 net.go:648] Add success.
I0321 00:02:43.422817 543705 net.go:770] primary dev: ETH0
I0321 00:02:43.422830 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:02:43.422842 543705 net.go:698] Add success.
I0321 00:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:02:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:02:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:02:53.410251 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:02:53.410382 543705 cpu.go:275] no items to output this cycle
I0321 00:02:53.410396 543705 memory.go:184] no items to output this cycle
E0321 00:03:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:03.409777 543705 memory.go:184] no items to output this cycle
I0321 00:03:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 00:03:13.410519 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:13.410549 543705 memory.go:191] Add success.
I0321 00:03:13.410563 543705 cpu.go:282] Add success.
W0321 00:03:13.410576 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:03:13.410588 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:03:13.410591 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:03:13.419874 543705 net.go:648] Add success.
I0321 00:03:13.422768 543705 net.go:770] primary dev: ETH0
I0321 00:03:13.422782 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:03:13.422794 543705 net.go:698] Add success.
I0321 00:03:13.571631 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"201d852c-da35-40e2-ba71-1adc763a7ae2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:03:13.571662 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:03:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:03:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:03:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 00:03:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:03:14.456682 543705 disk_worker.go:494] system disk:vda1
I0321 00:03:14.456711 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:03:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:03:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:03:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:03:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:03:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:03:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:23.409785 543705 memory.go:184] no items to output this cycle
I0321 00:03:23.409786 543705 cpu.go:275] no items to output this cycle
I0321 00:03:26.748929 543705 disk_info.go:125] begin check local disk info of client
I0321 00:03:26.751717 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:03:26.751722 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004614c0 0xc000461500]
E0321 00:03:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:33.409770 543705 memory.go:184] no items to output this cycle
I0321 00:03:33.409791 543705 cpu.go:275] no items to output this cycle
I0321 00:03:38.667475 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:03:38.667481 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:03:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:43.410727 543705 memory.go:191] Add success.
I0321 00:03:43.409816 543705 cpu.go:282] Add success.
I0321 00:03:43.420447 543705 net.go:648] Add success.
I0321 00:03:43.423737 543705 net.go:770] primary dev: ETH0
I0321 00:03:43.423751 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:03:43.423766 543705 net.go:698] Add success.
I0321 00:03:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:03:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:03:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:03:53.409907 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:03:53.409935 543705 cpu.go:275] no items to output this cycle
I0321 00:03:53.409942 543705 memory.go:184] no items to output this cycle
E0321 00:04:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:03.409786 543705 cpu.go:275] no items to output this cycle
I0321 00:04:03.409789 543705 memory.go:184] no items to output this cycle
E0321 00:04:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:13.409805 543705 memory.go:191] Add success.
I0321 00:04:13.409810 543705 cpu.go:282] Add success.
W0321 00:04:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:04:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:04:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:04:13.420197 543705 net.go:648] Add success.
I0321 00:04:13.423220 543705 net.go:770] primary dev: ETH0
I0321 00:04:13.423235 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:04:13.423250 543705 net.go:698] Add success.
I0321 00:04:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:04:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:04:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0321 00:04:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:04:14.456570 543705 disk_worker.go:494] system disk:vda1
I0321 00:04:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:04:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:04:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:04:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:04:16.458094 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:04:16.472495 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:04:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:23.409771 543705 memory.go:184] no items to output this cycle
I0321 00:04:23.409804 543705 cpu.go:275] no items to output this cycle
I0321 00:04:26.752944 543705 disk_info.go:125] begin check local disk info of client
I0321 00:04:26.755519 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:04:26.755526 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007adc0 0xc00007ae00]
E0321 00:04:33.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:33.409770 543705 memory.go:184] no items to output this cycle
I0321 00:04:33.409792 543705 cpu.go:275] no items to output this cycle
E0321 00:04:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:43.409815 543705 memory.go:191] Add success.
I0321 00:04:43.409822 543705 cpu.go:282] Add success.
I0321 00:04:43.419943 543705 net.go:648] Add success.
I0321 00:04:43.423045 543705 net.go:770] primary dev: ETH0
I0321 00:04:43.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:04:43.423070 543705 net.go:698] Add success.
I0321 00:04:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:04:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:04:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:04:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:04:53.409807 543705 memory.go:184] no items to output this cycle
I0321 00:04:53.409818 543705 cpu.go:275] no items to output this cycle
E0321 00:05:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:03.409898 543705 memory.go:184] no items to output this cycle
I0321 00:05:03.409932 543705 cpu.go:275] no items to output this cycle
E0321 00:05:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:13.409793 543705 memory.go:191] Add success.
W0321 00:05:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 00:05:13.409825 543705 cpu.go:282] Add success.
W0321 00:05:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:05:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:05:13.420183 543705 net.go:648] Add success.
I0321 00:05:13.423013 543705 net.go:770] primary dev: ETH0
I0321 00:05:13.423028 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:05:13.423055 543705 net.go:698] Add success.
I0321 00:05:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:05:14.455101 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:05:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0321 00:05:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:05:14.456511 543705 disk_worker.go:494] system disk:vda1
I0321 00:05:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:05:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:05:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:05:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:05:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:05:16.472418 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:05:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:23.409774 543705 memory.go:184] no items to output this cycle
I0321 00:05:23.409789 543705 cpu.go:275] no items to output this cycle
I0321 00:05:26.757039 543705 disk_info.go:125] begin check local disk info of client
I0321 00:05:26.759559 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:05:26.759574 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007a740 0xc00007a780]
E0321 00:05:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:33.409774 543705 memory.go:184] no items to output this cycle
I0321 00:05:33.409779 543705 cpu.go:275] no items to output this cycle
E0321 00:05:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:43.409788 543705 memory.go:191] Add success.
I0321 00:05:43.409798 543705 cpu.go:282] Add success.
I0321 00:05:43.419845 543705 net.go:648] Add success.
I0321 00:05:43.422580 543705 net.go:770] primary dev: ETH0
I0321 00:05:43.422593 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:05:43.422606 543705 net.go:698] Add success.
I0321 00:05:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:05:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:05:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:05:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:05:53.409797 543705 memory.go:184] no items to output this cycle
I0321 00:05:53.409809 543705 cpu.go:275] no items to output this cycle
E0321 00:06:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:03.409778 543705 memory.go:184] no items to output this cycle
I0321 00:06:03.409780 543705 cpu.go:275] no items to output this cycle
E0321 00:06:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:13.409927 543705 cpu.go:282] Add success.
I0321 00:06:13.409961 543705 memory.go:191] Add success.
W0321 00:06:13.409996 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:06:13.410015 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:06:13.410020 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:06:13.419714 543705 net.go:648] Add success.
I0321 00:06:13.423047 543705 net.go:770] primary dev: ETH0
I0321 00:06:13.423060 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:06:13.423072 543705 net.go:698] Add success.
I0321 00:06:13.469145 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c2e7f32b-036c-4478-a72f-220c59080492","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:06:13.469186 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:06:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:06:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:06:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0321 00:06:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:06:14.456780 543705 disk_worker.go:494] system disk:vda1
I0321 00:06:14.456809 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:06:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:06:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:06:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:06:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:06:16.472445 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:06:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:23.409773 543705 memory.go:184] no items to output this cycle
I0321 00:06:23.409795 543705 cpu.go:275] no items to output this cycle
I0321 00:06:26.759633 543705 disk_info.go:125] begin check local disk info of client
I0321 00:06:26.762235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:06:26.762242 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e79c0 0xc0003e7a00]
E0321 00:06:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:33.409777 543705 memory.go:184] no items to output this cycle
I0321 00:06:33.409778 543705 cpu.go:275] no items to output this cycle
I0321 00:06:38.667618 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:06:38.667637 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:06:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:43.410707 543705 memory.go:191] Add success.
I0321 00:06:43.409816 543705 cpu.go:282] Add success.
I0321 00:06:43.420428 543705 net.go:648] Add success.
I0321 00:06:43.423148 543705 net.go:770] primary dev: ETH0
I0321 00:06:43.423163 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:06:43.423177 543705 net.go:698] Add success.
I0321 00:06:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:06:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:06:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:06:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:06:53.409807 543705 memory.go:184] no items to output this cycle
I0321 00:06:53.409821 543705 cpu.go:275] no items to output this cycle
E0321 00:07:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:03.409799 543705 memory.go:184] no items to output this cycle
I0321 00:07:03.409816 543705 cpu.go:275] no items to output this cycle
E0321 00:07:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:13.409789 543705 memory.go:191] Add success.
W0321 00:07:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 00:07:13.409822 543705 cpu.go:282] Add success.
W0321 00:07:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:07:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:07:13.420231 543705 net.go:648] Add success.
I0321 00:07:13.423025 543705 net.go:770] primary dev: ETH0
I0321 00:07:13.423038 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:07:13.423050 543705 net.go:698] Add success.
I0321 00:07:13.453725 543705 event_worker.go:152] Polling the log file for events...
W0321 00:07:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:07:14.455352 543705 disk_worker.go:708] disk space is not compliant
W0321 00:07:14.455359 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:07:14.457036 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:07:14.457045 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:07:14.457051 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:07:14.457301 543705 disk_worker.go:494] system disk:vda1
I0321 00:07:14.457341 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:07:15.456855 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:07:15.456863 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:07:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:07:16.457935 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:07:16.457988 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:07:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:07:16.472328 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:07:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:23.409794 543705 memory.go:184] no items to output this cycle
I0321 00:07:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 00:07:26.762739 543705 disk_info.go:125] begin check local disk info of client
I0321 00:07:26.765268 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:07:26.765275 543705 disk_info.go:196] parse disk info done, disk is : [0xc000515940 0xc000515980]
E0321 00:07:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:33.409782 543705 memory.go:184] no items to output this cycle
I0321 00:07:33.409793 543705 cpu.go:275] no items to output this cycle
E0321 00:07:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:43.409783 543705 memory.go:191] Add success.
I0321 00:07:43.409820 543705 cpu.go:282] Add success.
I0321 00:07:43.419855 543705 net.go:648] Add success.
I0321 00:07:43.422686 543705 net.go:770] primary dev: ETH0
I0321 00:07:43.422699 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:07:43.422711 543705 net.go:698] Add success.
I0321 00:07:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:07:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:07:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:07:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:07:53.409789 543705 memory.go:184] no items to output this cycle
I0321 00:07:53.409788 543705 cpu.go:275] no items to output this cycle
E0321 00:08:03.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:03.409767 543705 memory.go:184] no items to output this cycle
I0321 00:08:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:08:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:13.409793 543705 memory.go:191] Add success.
I0321 00:08:13.409821 543705 cpu.go:282] Add success.
W0321 00:08:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:08:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:08:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:08:13.420289 543705 net.go:648] Add success.
I0321 00:08:13.422973 543705 net.go:770] primary dev: ETH0
I0321 00:08:13.422986 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:08:13.422999 543705 net.go:698] Add success.
I0321 00:08:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:08:14.455114 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:08:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0321 00:08:14.455186 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:08:14.456562 543705 disk_worker.go:494] system disk:vda1
I0321 00:08:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:08:15.456014 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:08:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:08:16.458065 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:08:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:08:16.472527 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:08:23.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:23.409769 543705 memory.go:184] no items to output this cycle
I0321 00:08:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 00:08:26.765670 543705 disk_info.go:125] begin check local disk info of client
I0321 00:08:26.768492 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:08:26.768497 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa680 0xc0001aa6c0]
E0321 00:08:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:33.409780 543705 memory.go:184] no items to output this cycle
I0321 00:08:33.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:08:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:43.409783 543705 memory.go:191] Add success.
I0321 00:08:43.409808 543705 cpu.go:282] Add success.
I0321 00:08:43.419835 543705 net.go:648] Add success.
I0321 00:08:43.422443 543705 net.go:770] primary dev: ETH0
I0321 00:08:43.422456 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:08:43.422468 543705 net.go:698] Add success.
I0321 00:08:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:08:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:08:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:08:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:08:53.409806 543705 memory.go:184] no items to output this cycle
I0321 00:08:53.409815 543705 cpu.go:275] no items to output this cycle
E0321 00:09:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:03.409776 543705 memory.go:184] no items to output this cycle
I0321 00:09:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:09:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:13.409794 543705 memory.go:191] Add success.
I0321 00:09:13.409802 543705 cpu.go:282] Add success.
W0321 00:09:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:09:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:09:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:09:13.420163 543705 net.go:648] Add success.
I0321 00:09:13.423095 543705 net.go:770] primary dev: ETH0
I0321 00:09:13.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:09:13.423124 543705 net.go:698] Add success.
I0321 00:09:13.476523 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2fb59ccd-33de-4327-b163-035e9804f567","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:09:13.476565 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:09:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:09:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:09:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0321 00:09:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:09:14.456738 543705 disk_worker.go:494] system disk:vda1
I0321 00:09:14.456768 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:09:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:09:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:09:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:09:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:09:16.472578 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:09:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:23.409793 543705 memory.go:184] no items to output this cycle
I0321 00:09:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 00:09:26.769095 543705 disk_info.go:125] begin check local disk info of client
I0321 00:09:26.771597 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:09:26.771603 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056dac0 0xc00056db00]
E0321 00:09:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:33.409765 543705 memory.go:184] no items to output this cycle
I0321 00:09:33.409808 543705 cpu.go:275] no items to output this cycle
I0321 00:09:38.667773 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:09:38.667780 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:09:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:43.410673 543705 memory.go:191] Add success.
I0321 00:09:43.409797 543705 cpu.go:282] Add success.
I0321 00:09:43.420385 543705 net.go:648] Add success.
I0321 00:09:43.422927 543705 net.go:770] primary dev: ETH0
I0321 00:09:43.422940 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:09:43.422953 543705 net.go:698] Add success.
I0321 00:09:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:09:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:09:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:09:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:09:53.409777 543705 memory.go:184] no items to output this cycle
I0321 00:09:53.409796 543705 cpu.go:275] no items to output this cycle
E0321 00:10:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:03.409782 543705 memory.go:184] no items to output this cycle
I0321 00:10:03.409783 543705 cpu.go:275] no items to output this cycle
E0321 00:10:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:13.409819 543705 memory.go:191] Add success.
I0321 00:10:13.409824 543705 cpu.go:282] Add success.
W0321 00:10:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:10:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:10:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:10:13.420073 543705 net.go:648] Add success.
I0321 00:10:13.422640 543705 net.go:770] primary dev: ETH0
I0321 00:10:13.422655 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:10:13.422670 543705 net.go:698] Add success.
I0321 00:10:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:10:14.455208 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:10:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0321 00:10:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:10:14.456609 543705 disk_worker.go:494] system disk:vda1
I0321 00:10:14.456639 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:10:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:10:16.457997 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:10:16.458077 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:10:16.458111 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:10:16.472534 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:10:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:10:23.409788 543705 cpu.go:275] no items to output this cycle
I0321 00:10:26.773114 543705 disk_info.go:125] begin check local disk info of client
I0321 00:10:26.775725 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:10:26.775732 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab140 0xc0001ab180]
E0321 00:10:33.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:33.409771 543705 memory.go:184] no items to output this cycle
I0321 00:10:33.409777 543705 cpu.go:275] no items to output this cycle
E0321 00:10:43.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:43.409818 543705 memory.go:191] Add success.
I0321 00:10:43.409830 543705 cpu.go:282] Add success.
I0321 00:10:43.419957 543705 net.go:648] Add success.
I0321 00:10:43.422723 543705 net.go:770] primary dev: ETH0
I0321 00:10:43.422737 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:10:43.422750 543705 net.go:698] Add success.
I0321 00:10:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:10:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:10:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:10:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:10:53.409782 543705 memory.go:184] no items to output this cycle
I0321 00:10:53.409794 543705 cpu.go:275] no items to output this cycle
E0321 00:11:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:03.409794 543705 memory.go:184] no items to output this cycle
I0321 00:11:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:11:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:13.409824 543705 memory.go:191] Add success.
I0321 00:11:13.409831 543705 cpu.go:282] Add success.
W0321 00:11:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:11:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:11:13.409879 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:11:13.420076 543705 net.go:648] Add success.
I0321 00:11:13.422995 543705 net.go:770] primary dev: ETH0
I0321 00:11:13.423009 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:11:13.423027 543705 net.go:698] Add success.
I0321 00:11:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:11:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:11:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0321 00:11:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:11:14.456533 543705 disk_worker.go:494] system disk:vda1
I0321 00:11:14.456576 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:11:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:11:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:11:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:11:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:11:16.472439 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:11:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:23.409787 543705 memory.go:184] no items to output this cycle
I0321 00:11:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 00:11:26.777043 543705 disk_info.go:125] begin check local disk info of client
I0321 00:11:26.779595 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:11:26.779601 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004f8380 0xc0004f83c0]
E0321 00:11:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:33.409781 543705 memory.go:184] no items to output this cycle
I0321 00:11:33.409789 543705 cpu.go:275] no items to output this cycle
E0321 00:11:43.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:43.409833 543705 memory.go:191] Add success.
I0321 00:11:43.409851 543705 cpu.go:282] Add success.
I0321 00:11:43.420187 543705 net.go:648] Add success.
I0321 00:11:43.422969 543705 net.go:770] primary dev: ETH0
I0321 00:11:43.422984 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:11:43.422999 543705 net.go:698] Add success.
I0321 00:11:46.458027 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:11:46.458091 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:11:46.458125 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:11:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:11:53.409780 543705 memory.go:184] no items to output this cycle
I0321 00:11:53.409783 543705 cpu.go:275] no items to output this cycle
E0321 00:12:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:03.409775 543705 memory.go:184] no items to output this cycle
I0321 00:12:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:12:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:13.409794 543705 memory.go:191] Add success.
I0321 00:12:13.409798 543705 cpu.go:282] Add success.
W0321 00:12:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:12:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:12:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:12:13.420051 543705 net.go:648] Add success.
I0321 00:12:13.423103 543705 net.go:770] primary dev: ETH0
I0321 00:12:13.423116 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:12:13.423129 543705 net.go:698] Add success.
I0321 00:12:13.464367 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cd2a5138-c45e-4866-9010-44e488debad9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:12:13.464406 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 00:12:14.455233 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:12:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0321 00:12:14.455250 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:12:14.455922 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:12:14.455932 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:12:14.455938 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:12:14.456847 543705 disk_worker.go:494] system disk:vda1
I0321 00:12:14.456878 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:12:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:12:15.456838 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:12:16.458099 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:12:16.458158 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:12:16.458181 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:12:16.458187 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:12:16.472570 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:12:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:23.409769 543705 memory.go:184] no items to output this cycle
I0321 00:12:23.409789 543705 cpu.go:275] no items to output this cycle
I0321 00:12:26.781186 543705 disk_info.go:125] begin check local disk info of client
I0321 00:12:26.783733 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:12:26.783739 543705 disk_info.go:196] parse disk info done, disk is : [0xc000274d40 0xc000274d80]
E0321 00:12:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:33.409801 543705 memory.go:184] no items to output this cycle
I0321 00:12:33.409815 543705 cpu.go:275] no items to output this cycle
I0321 00:12:38.667916 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:12:38.667923 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:12:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:43.410617 543705 memory.go:191] Add success.
I0321 00:12:43.409799 543705 cpu.go:282] Add success.
I0321 00:12:43.420326 543705 net.go:648] Add success.
I0321 00:12:43.423029 543705 net.go:770] primary dev: ETH0
I0321 00:12:43.423044 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:12:43.423059 543705 net.go:698] Add success.
I0321 00:12:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:12:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:12:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:12:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:12:53.409777 543705 memory.go:184] no items to output this cycle
I0321 00:12:53.409779 543705 cpu.go:275] no items to output this cycle
E0321 00:13:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:03.409763 543705 memory.go:184] no items to output this cycle
I0321 00:13:03.409801 543705 cpu.go:275] no items to output this cycle
E0321 00:13:13.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:13.409836 543705 memory.go:191] Add success.
I0321 00:13:13.409840 543705 cpu.go:282] Add success.
W0321 00:13:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:13:13.409884 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:13:13.409888 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:13:13.420701 543705 net.go:648] Add success.
I0321 00:13:13.423581 543705 net.go:770] primary dev: ETH0
I0321 00:13:13.423594 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:13:13.423605 543705 net.go:698] Add success.
I0321 00:13:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:13:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:13:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 00:13:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:13:14.456581 543705 disk_worker.go:494] system disk:vda1
I0321 00:13:14.456609 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:13:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:13:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:13:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:13:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:13:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:13:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:13:23.409782 543705 cpu.go:275] no items to output this cycle
I0321 00:13:26.785076 543705 disk_info.go:125] begin check local disk info of client
I0321 00:13:26.787830 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:13:26.787836 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ab9c0 0xc0003aba00]
E0321 00:13:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:33.409764 543705 memory.go:184] no items to output this cycle
I0321 00:13:33.409799 543705 cpu.go:275] no items to output this cycle
E0321 00:13:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:43.409800 543705 memory.go:191] Add success.
I0321 00:13:43.409809 543705 cpu.go:282] Add success.
I0321 00:13:43.419861 543705 net.go:648] Add success.
I0321 00:13:43.423406 543705 net.go:770] primary dev: ETH0
I0321 00:13:43.423419 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:13:43.423431 543705 net.go:698] Add success.
I0321 00:13:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:13:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:13:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:13:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:13:53.409777 543705 memory.go:184] no items to output this cycle
I0321 00:13:53.409794 543705 cpu.go:275] no items to output this cycle
E0321 00:14:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:03.409798 543705 memory.go:184] no items to output this cycle
I0321 00:14:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:14:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:13.409789 543705 memory.go:191] Add success.
W0321 00:14:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 00:14:13.409818 543705 cpu.go:282] Add success.
W0321 00:14:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:14:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:14:13.420166 543705 net.go:648] Add success.
I0321 00:14:13.422883 543705 net.go:770] primary dev: ETH0
I0321 00:14:13.422896 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:14:13.422907 543705 net.go:698] Add success.
I0321 00:14:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:14:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:14:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0321 00:14:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:14:14.456614 543705 disk_worker.go:494] system disk:vda1
I0321 00:14:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:14:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:14:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:14:16.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:14:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:14:16.472681 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:14:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:23.409778 543705 memory.go:184] no items to output this cycle
I0321 00:14:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 00:14:26.789172 543705 disk_info.go:125] begin check local disk info of client
I0321 00:14:26.791710 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:14:26.791716 543705 disk_info.go:196] parse disk info done, disk is : [0xc000474800 0xc000474840]
I0321 00:14:33.409884 543705 cpu.go:275] no items to output this cycle
E0321 00:14:33.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:33.409905 543705 memory.go:184] no items to output this cycle
E0321 00:14:43.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:43.409777 543705 memory.go:191] Add success.
I0321 00:14:43.410140 543705 cpu.go:282] Add success.
I0321 00:14:43.420044 543705 net.go:648] Add success.
I0321 00:14:43.421062 543705 net.go:770] primary dev: ETH0
I0321 00:14:43.421074 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:14:43.421086 543705 net.go:698] Add success.
I0321 00:14:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:14:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:14:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:14:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:14:53.409766 543705 memory.go:184] no items to output this cycle
I0321 00:14:53.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:15:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:03.409763 543705 memory.go:184] no items to output this cycle
I0321 00:15:03.409798 543705 cpu.go:275] no items to output this cycle
E0321 00:15:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:13.409816 543705 memory.go:191] Add success.
I0321 00:15:13.409824 543705 cpu.go:282] Add success.
W0321 00:15:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:15:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:15:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:15:13.420062 543705 net.go:648] Add success.
I0321 00:15:13.422528 543705 net.go:770] primary dev: ETH0
I0321 00:15:13.422540 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:15:13.422554 543705 net.go:698] Add success.
I0321 00:15:13.468922 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"178d2553-ea46-4d8e-9c34-c6abd82897f1","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:15:13.468957 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:15:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:15:14.455109 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:15:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0321 00:15:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:15:14.456684 543705 disk_worker.go:494] system disk:vda1
I0321 00:15:14.456714 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:15:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:15:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:15:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:15:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:15:16.472371 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:15:23.409847 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:23.409874 543705 memory.go:184] no items to output this cycle
I0321 00:15:23.409971 543705 cpu.go:275] no items to output this cycle
I0321 00:15:26.793155 543705 disk_info.go:125] begin check local disk info of client
I0321 00:15:26.795736 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:15:26.795742 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035f180 0xc00035f1c0]
E0321 00:15:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:33.409783 543705 memory.go:184] no items to output this cycle
I0321 00:15:33.409790 543705 cpu.go:275] no items to output this cycle
I0321 00:15:38.669034 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:15:38.669040 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:15:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:43.410544 543705 memory.go:191] Add success.
I0321 00:15:43.409826 543705 cpu.go:282] Add success.
I0321 00:15:43.420256 543705 net.go:648] Add success.
I0321 00:15:43.422889 543705 net.go:770] primary dev: ETH0
I0321 00:15:43.422904 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:15:43.422919 543705 net.go:698] Add success.
I0321 00:15:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:15:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:15:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:15:53.410263 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:15:53.410276 543705 cpu.go:275] no items to output this cycle
I0321 00:15:53.410278 543705 memory.go:184] no items to output this cycle
E0321 00:16:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:03.409798 543705 memory.go:184] no items to output this cycle
I0321 00:16:03.409812 543705 cpu.go:275] no items to output this cycle
E0321 00:16:13.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:13.409799 543705 memory.go:191] Add success.
I0321 00:16:13.409801 543705 cpu.go:282] Add success.
W0321 00:16:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:16:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:16:13.409844 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:16:13.420101 543705 net.go:648] Add success.
I0321 00:16:13.422571 543705 net.go:770] primary dev: ETH0
I0321 00:16:13.422584 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:16:13.422596 543705 net.go:698] Add success.
I0321 00:16:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:16:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:16:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0321 00:16:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:16:14.456520 543705 disk_worker.go:494] system disk:vda1
I0321 00:16:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:16:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:16:16.458015 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:16:16.458079 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:16:16.458109 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:16:16.472578 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:16:23.409869 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:23.409894 543705 memory.go:184] no items to output this cycle
I0321 00:16:23.409904 543705 cpu.go:275] no items to output this cycle
I0321 00:16:26.797226 543705 disk_info.go:125] begin check local disk info of client
I0321 00:16:26.800042 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:16:26.800048 543705 disk_info.go:196] parse disk info done, disk is : [0xc000487c40 0xc000487c80]
E0321 00:16:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:33.409773 543705 memory.go:184] no items to output this cycle
I0321 00:16:33.409787 543705 cpu.go:275] no items to output this cycle
E0321 00:16:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:43.409792 543705 cpu.go:282] Add success.
I0321 00:16:43.409793 543705 memory.go:191] Add success.
I0321 00:16:43.419887 543705 net.go:648] Add success.
I0321 00:16:43.422728 543705 net.go:770] primary dev: ETH0
I0321 00:16:43.422742 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:16:43.422755 543705 net.go:698] Add success.
I0321 00:16:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:16:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:16:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:16:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:16:53.409771 543705 memory.go:184] no items to output this cycle
I0321 00:16:53.409783 543705 cpu.go:275] no items to output this cycle
E0321 00:17:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:03.409790 543705 memory.go:184] no items to output this cycle
I0321 00:17:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 00:17:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:13.409783 543705 memory.go:191] Add success.
I0321 00:17:13.409806 543705 cpu.go:282] Add success.
W0321 00:17:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:17:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:17:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:17:13.420158 543705 net.go:648] Add success.
I0321 00:17:13.422989 543705 net.go:770] primary dev: ETH0
I0321 00:17:13.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:17:13.423018 543705 net.go:698] Add success.
I0321 00:17:13.453590 543705 event_worker.go:152] Polling the log file for events...
W0321 00:17:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:17:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0321 00:17:14.455180 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:17:14.455913 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:17:14.455922 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:17:14.455928 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:17:14.456538 543705 disk_worker.go:494] system disk:vda1
I0321 00:17:14.456576 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:17:15.456838 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:17:15.456847 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:17:16.457955 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:17:16.457969 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:17:16.458009 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:17:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:17:16.472349 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:17:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:17:23.409784 543705 cpu.go:275] no items to output this cycle
I0321 00:17:26.801200 543705 disk_info.go:125] begin check local disk info of client
I0321 00:17:26.803717 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:17:26.803724 543705 disk_info.go:196] parse disk info done, disk is : [0xc000493dc0 0xc000493e00]
E0321 00:17:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:33.409762 543705 memory.go:184] no items to output this cycle
I0321 00:17:33.409801 543705 cpu.go:275] no items to output this cycle
E0321 00:17:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:43.409792 543705 memory.go:191] Add success.
I0321 00:17:43.409794 543705 cpu.go:282] Add success.
I0321 00:17:43.419905 543705 net.go:648] Add success.
I0321 00:17:43.422732 543705 net.go:770] primary dev: ETH0
I0321 00:17:43.422747 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:17:43.422761 543705 net.go:698] Add success.
I0321 00:17:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:17:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:17:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:17:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:17:53.409780 543705 memory.go:184] no items to output this cycle
I0321 00:17:53.409781 543705 cpu.go:275] no items to output this cycle
E0321 00:18:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:03.409782 543705 memory.go:184] no items to output this cycle
I0321 00:18:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 00:18:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:13.409791 543705 memory.go:191] Add success.
I0321 00:18:13.409813 543705 cpu.go:282] Add success.
W0321 00:18:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:18:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:18:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:18:13.420068 543705 net.go:648] Add success.
I0321 00:18:13.422788 543705 net.go:770] primary dev: ETH0
I0321 00:18:13.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:18:13.422813 543705 net.go:698] Add success.
I0321 00:18:13.464467 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"4e7f9eb7-c0a6-4e25-b4aa-18ef7de64fdf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:18:13.464502 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:18:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:18:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:18:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0321 00:18:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:18:14.456564 543705 disk_worker.go:494] system disk:vda1
I0321 00:18:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:18:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:18:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:18:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:18:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:18:16.472521 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:18:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:23.409804 543705 memory.go:184] no items to output this cycle
I0321 00:18:23.409810 543705 cpu.go:275] no items to output this cycle
I0321 00:18:26.805281 543705 disk_info.go:125] begin check local disk info of client
I0321 00:18:26.807896 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:18:26.807903 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003704c0 0xc000370500]
E0321 00:18:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:33.409797 543705 memory.go:184] no items to output this cycle
I0321 00:18:33.409812 543705 cpu.go:275] no items to output this cycle
I0321 00:18:38.669732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:18:38.669739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:18:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:43.410605 543705 memory.go:191] Add success.
I0321 00:18:43.409810 543705 cpu.go:282] Add success.
I0321 00:18:43.420286 543705 net.go:648] Add success.
I0321 00:18:43.422905 543705 net.go:770] primary dev: ETH0
I0321 00:18:43.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:18:43.422931 543705 net.go:698] Add success.
I0321 00:18:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:18:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:18:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:18:53.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:18:53.409806 543705 memory.go:184] no items to output this cycle
I0321 00:18:53.409817 543705 cpu.go:275] no items to output this cycle
E0321 00:19:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:03.409775 543705 memory.go:184] no items to output this cycle
I0321 00:19:03.409777 543705 cpu.go:275] no items to output this cycle
E0321 00:19:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:13.409792 543705 memory.go:191] Add success.
I0321 00:19:13.409809 543705 cpu.go:282] Add success.
W0321 00:19:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:19:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:19:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:19:13.420258 543705 net.go:648] Add success.
I0321 00:19:13.422957 543705 net.go:770] primary dev: ETH0
I0321 00:19:13.422971 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:19:13.422982 543705 net.go:698] Add success.
I0321 00:19:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:19:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:19:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0321 00:19:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:19:14.456588 543705 disk_worker.go:494] system disk:vda1
I0321 00:19:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:19:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:19:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:19:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:19:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:19:16.472445 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:19:23.410374 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:23.410405 543705 memory.go:184] no items to output this cycle
I0321 00:19:23.410440 543705 cpu.go:275] no items to output this cycle
I0321 00:19:26.809234 543705 disk_info.go:125] begin check local disk info of client
I0321 00:19:26.811740 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:19:26.811746 543705 disk_info.go:196] parse disk info done, disk is : [0xc000270140 0xc000270180]
E0321 00:19:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:33.409788 543705 memory.go:184] no items to output this cycle
I0321 00:19:33.409803 543705 cpu.go:275] no items to output this cycle
E0321 00:19:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:43.409810 543705 memory.go:191] Add success.
I0321 00:19:43.409818 543705 cpu.go:282] Add success.
I0321 00:19:43.419878 543705 net.go:648] Add success.
I0321 00:19:43.423117 543705 net.go:770] primary dev: ETH0
I0321 00:19:43.423130 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:19:43.423143 543705 net.go:698] Add success.
I0321 00:19:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:19:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:19:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:19:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:19:53.409771 543705 memory.go:184] no items to output this cycle
I0321 00:19:53.409789 543705 cpu.go:275] no items to output this cycle
E0321 00:20:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:03.409775 543705 memory.go:184] no items to output this cycle
I0321 00:20:03.409780 543705 cpu.go:275] no items to output this cycle
E0321 00:20:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:13.409793 543705 memory.go:191] Add success.
I0321 00:20:13.409814 543705 cpu.go:282] Add success.
W0321 00:20:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:20:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:20:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:20:13.420154 543705 net.go:648] Add success.
I0321 00:20:13.422733 543705 net.go:770] primary dev: ETH0
I0321 00:20:13.422746 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:20:13.422758 543705 net.go:698] Add success.
I0321 00:20:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:20:14.455157 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:20:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0321 00:20:14.455170 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:20:14.456515 543705 disk_worker.go:494] system disk:vda1
I0321 00:20:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:20:15.456019 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:20:16.457998 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:20:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:20:16.458096 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:20:16.472499 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:20:23.409799 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:23.409819 543705 memory.go:184] no items to output this cycle
I0321 00:20:23.409831 543705 cpu.go:275] no items to output this cycle
I0321 00:20:26.813285 543705 disk_info.go:125] begin check local disk info of client
I0321 00:20:26.815855 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:20:26.815861 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5480 0xc0000c54c0]
E0321 00:20:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:33.409776 543705 memory.go:184] no items to output this cycle
I0321 00:20:33.409804 543705 cpu.go:275] no items to output this cycle
E0321 00:20:43.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:43.409827 543705 memory.go:191] Add success.
I0321 00:20:43.409838 543705 cpu.go:282] Add success.
I0321 00:20:43.420001 543705 net.go:648] Add success.
I0321 00:20:43.423062 543705 net.go:770] primary dev: ETH0
I0321 00:20:43.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:20:43.423093 543705 net.go:698] Add success.
I0321 00:20:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:20:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:20:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:20:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:20:53.409783 543705 memory.go:184] no items to output this cycle
I0321 00:20:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 00:21:03.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:03.409791 543705 cpu.go:275] no items to output this cycle
I0321 00:21:03.409793 543705 memory.go:184] no items to output this cycle
E0321 00:21:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:13.409804 543705 memory.go:191] Add success.
W0321 00:21:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 00:21:13.409840 543705 cpu.go:282] Add success.
W0321 00:21:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:21:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:21:13.420213 543705 net.go:648] Add success.
I0321 00:21:13.422884 543705 net.go:770] primary dev: ETH0
I0321 00:21:13.422898 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:21:13.422909 543705 net.go:698] Add success.
I0321 00:21:13.469133 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e70a0bda-339d-4b9b-b795-1412c34ba111","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:21:13.469168 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:21:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:21:14.455372 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:21:14.455463 543705 disk_worker.go:708] disk space is not compliant
W0321 00:21:14.455468 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:21:14.457122 543705 disk_worker.go:494] system disk:vda1
I0321 00:21:14.457152 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:21:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:21:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:21:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:21:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:21:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:21:23.410407 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:23.410426 543705 memory.go:184] no items to output this cycle
I0321 00:21:23.410444 543705 cpu.go:275] no items to output this cycle
I0321 00:21:26.817283 543705 disk_info.go:125] begin check local disk info of client
I0321 00:21:26.819816 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:21:26.819822 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002a5d40 0xc0002a5d80]
E0321 00:21:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:33.409791 543705 memory.go:184] no items to output this cycle
I0321 00:21:33.409803 543705 cpu.go:275] no items to output this cycle
I0321 00:21:38.671050 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:21:38.671058 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:21:43.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:43.409794 543705 memory.go:191] Add success.
I0321 00:21:43.409805 543705 cpu.go:282] Add success.
I0321 00:21:43.419995 543705 net.go:648] Add success.
I0321 00:21:43.420868 543705 net.go:770] primary dev: ETH0
I0321 00:21:43.420881 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:21:43.420894 543705 net.go:698] Add success.
I0321 00:21:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:21:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:21:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:21:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:21:53.409810 543705 memory.go:184] no items to output this cycle
I0321 00:21:53.409824 543705 cpu.go:275] no items to output this cycle
E0321 00:22:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:03.409768 543705 memory.go:184] no items to output this cycle
I0321 00:22:03.409803 543705 cpu.go:275] no items to output this cycle
E0321 00:22:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:13.409820 543705 memory.go:191] Add success.
I0321 00:22:13.409826 543705 cpu.go:282] Add success.
W0321 00:22:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:22:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:22:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:22:13.420156 543705 net.go:648] Add success.
I0321 00:22:13.423269 543705 net.go:770] primary dev: ETH0
I0321 00:22:13.423284 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:22:13.423460 543705 net.go:698] Add success.
W0321 00:22:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:22:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 00:22:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:22:14.456805 543705 disk_worker.go:494] system disk:vda1
I0321 00:22:14.456843 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:22:14.457156 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:22:14.457164 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:22:14.457168 543705 custom_config.go:64] query custom config with name: gpu
E0321 00:22:15.456784 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:22:15.456791 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:22:16.457918 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:22:16.457918 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:22:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:22:16.458006 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:22:16.472382 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:22:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:23.409807 543705 memory.go:184] no items to output this cycle
I0321 00:22:23.409814 543705 cpu.go:275] no items to output this cycle
I0321 00:22:26.821251 543705 disk_info.go:125] begin check local disk info of client
I0321 00:22:26.823813 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:22:26.823820 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b84c0 0xc0002b8500]
E0321 00:22:33.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:33.409804 543705 memory.go:184] no items to output this cycle
I0321 00:22:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 00:22:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:43.409783 543705 memory.go:191] Add success.
I0321 00:22:43.409805 543705 cpu.go:282] Add success.
I0321 00:22:43.419889 543705 net.go:648] Add success.
I0321 00:22:43.422995 543705 net.go:770] primary dev: ETH0
I0321 00:22:43.423008 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:22:43.423020 543705 net.go:698] Add success.
I0321 00:22:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:22:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:22:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:22:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:22:53.409778 543705 memory.go:184] no items to output this cycle
I0321 00:22:53.409782 543705 cpu.go:275] no items to output this cycle
E0321 00:23:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:03.409783 543705 memory.go:184] no items to output this cycle
I0321 00:23:03.409784 543705 cpu.go:275] no items to output this cycle
E0321 00:23:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:13.409805 543705 memory.go:191] Add success.
I0321 00:23:13.409808 543705 cpu.go:282] Add success.
W0321 00:23:13.409891 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:23:13.409962 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:23:13.409967 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:23:13.419718 543705 net.go:648] Add success.
I0321 00:23:13.422259 543705 net.go:770] primary dev: ETH0
I0321 00:23:13.422275 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:23:13.422289 543705 net.go:698] Add success.
I0321 00:23:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:23:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:23:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0321 00:23:14.455202 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:23:14.456578 543705 disk_worker.go:494] system disk:vda1
I0321 00:23:14.456606 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:23:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:23:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:23:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:23:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:23:16.472405 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:23:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:23:23.409786 543705 cpu.go:275] no items to output this cycle
I0321 00:23:26.825341 543705 disk_info.go:125] begin check local disk info of client
I0321 00:23:26.827847 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:23:26.827853 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5bc0 0xc0000c5c00]
E0321 00:23:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:33.409796 543705 memory.go:184] no items to output this cycle
I0321 00:23:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 00:23:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:43.409787 543705 memory.go:191] Add success.
I0321 00:23:43.409810 543705 cpu.go:282] Add success.
I0321 00:23:43.419855 543705 net.go:648] Add success.
I0321 00:23:43.422646 543705 net.go:770] primary dev: ETH0
I0321 00:23:43.422662 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:23:43.422677 543705 net.go:698] Add success.
I0321 00:23:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:23:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:23:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:23:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:23:53.409772 543705 memory.go:184] no items to output this cycle
I0321 00:23:53.409794 543705 cpu.go:275] no items to output this cycle
E0321 00:24:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:03.409867 543705 memory.go:184] no items to output this cycle
I0321 00:24:03.409910 543705 cpu.go:275] no items to output this cycle
E0321 00:24:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:13.409806 543705 memory.go:191] Add success.
I0321 00:24:13.409808 543705 cpu.go:282] Add success.
W0321 00:24:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:24:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:24:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:24:13.420181 543705 net.go:648] Add success.
I0321 00:24:13.422890 543705 net.go:770] primary dev: ETH0
I0321 00:24:13.422905 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:24:13.422919 543705 net.go:698] Add success.
I0321 00:24:13.469682 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"71361961-5e6c-4da8-863d-b45b640d9a95","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:24:13.469715 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:24:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:24:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:24:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0321 00:24:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:24:14.456622 543705 disk_worker.go:494] system disk:vda1
I0321 00:24:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:24:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:24:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:24:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:24:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:24:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:24:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:23.409785 543705 memory.go:184] no items to output this cycle
I0321 00:24:23.409785 543705 cpu.go:275] no items to output this cycle
I0321 00:24:26.829388 543705 disk_info.go:125] begin check local disk info of client
I0321 00:24:26.831996 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:24:26.832003 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d300 0xc00056d340]
E0321 00:24:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:33.409776 543705 memory.go:184] no items to output this cycle
I0321 00:24:33.409782 543705 cpu.go:275] no items to output this cycle
I0321 00:24:38.671211 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:24:38.671218 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:24:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:43.410616 543705 memory.go:191] Add success.
I0321 00:24:43.409822 543705 cpu.go:282] Add success.
I0321 00:24:43.420328 543705 net.go:648] Add success.
I0321 00:24:43.423006 543705 net.go:770] primary dev: ETH0
I0321 00:24:43.423021 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:24:43.423041 543705 net.go:698] Add success.
I0321 00:24:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:24:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:24:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:24:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:24:53.409765 543705 memory.go:184] no items to output this cycle
I0321 00:24:53.409806 543705 cpu.go:275] no items to output this cycle
E0321 00:25:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:03.409793 543705 memory.go:184] no items to output this cycle
I0321 00:25:03.409814 543705 cpu.go:275] no items to output this cycle
E0321 00:25:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:13.409790 543705 memory.go:191] Add success.
W0321 00:25:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 00:25:13.409824 543705 cpu.go:282] Add success.
W0321 00:25:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:25:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:25:13.420144 543705 net.go:648] Add success.
I0321 00:25:13.422749 543705 net.go:770] primary dev: ETH0
I0321 00:25:13.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:25:13.422774 543705 net.go:698] Add success.
I0321 00:25:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:25:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:25:14.455219 543705 disk_worker.go:708] disk space is not compliant
W0321 00:25:14.455222 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:25:14.456621 543705 disk_worker.go:494] system disk:vda1
I0321 00:25:14.456658 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:25:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:25:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:25:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:25:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:25:16.472416 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:25:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:23.409783 543705 memory.go:184] no items to output this cycle
I0321 00:25:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 00:25:26.833414 543705 disk_info.go:125] begin check local disk info of client
I0321 00:25:26.836184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:25:26.836199 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001f9e00 0xc0001f9e40]
E0321 00:25:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:33.409782 543705 memory.go:184] no items to output this cycle
I0321 00:25:33.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:25:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:43.409789 543705 memory.go:191] Add success.
I0321 00:25:43.409803 543705 cpu.go:282] Add success.
I0321 00:25:43.419872 543705 net.go:648] Add success.
I0321 00:25:43.422525 543705 net.go:770] primary dev: ETH0
I0321 00:25:43.422539 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:25:43.422552 543705 net.go:698] Add success.
I0321 00:25:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:25:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:25:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:25:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:25:53.409799 543705 memory.go:184] no items to output this cycle
I0321 00:25:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 00:26:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:03.409762 543705 memory.go:184] no items to output this cycle
I0321 00:26:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:26:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:13.409798 543705 memory.go:191] Add success.
I0321 00:26:13.409799 543705 cpu.go:282] Add success.
W0321 00:26:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:26:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:26:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:26:13.420178 543705 net.go:648] Add success.
I0321 00:26:13.423233 543705 net.go:770] primary dev: ETH0
I0321 00:26:13.423246 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:26:13.423271 543705 net.go:698] Add success.
I0321 00:26:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:26:14.455203 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:26:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0321 00:26:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:26:14.456620 543705 disk_worker.go:494] system disk:vda1
I0321 00:26:14.456652 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:26:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:26:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:26:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:26:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:26:16.472385 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:26:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:23.409783 543705 memory.go:184] no items to output this cycle
I0321 00:26:23.409787 543705 cpu.go:275] no items to output this cycle
I0321 00:26:26.837429 543705 disk_info.go:125] begin check local disk info of client
I0321 00:26:26.840018 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:26:26.840025 543705 disk_info.go:196] parse disk info done, disk is : [0xc000340380 0xc0003403c0]
E0321 00:26:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:33.409797 543705 memory.go:184] no items to output this cycle
I0321 00:26:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 00:26:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:43.409773 543705 memory.go:191] Add success.
I0321 00:26:43.409804 543705 cpu.go:282] Add success.
I0321 00:26:43.420103 543705 net.go:648] Add success.
I0321 00:26:43.422714 543705 net.go:770] primary dev: ETH0
I0321 00:26:43.422727 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:26:43.422738 543705 net.go:698] Add success.
I0321 00:26:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:26:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:26:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:26:53.410346 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:26:53.410365 543705 memory.go:184] no items to output this cycle
I0321 00:26:53.410382 543705 cpu.go:275] no items to output this cycle
E0321 00:27:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:03.409768 543705 memory.go:184] no items to output this cycle
I0321 00:27:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 00:27:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:13.409798 543705 memory.go:191] Add success.
I0321 00:27:13.409800 543705 cpu.go:282] Add success.
W0321 00:27:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:27:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:27:13.409840 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:27:13.420211 543705 net.go:648] Add success.
I0321 00:27:13.423134 543705 net.go:770] primary dev: ETH0
I0321 00:27:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:27:13.423167 543705 net.go:698] Add success.
I0321 00:27:13.429897 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 00:27:13.453073 543705 event_worker.go:152] Polling the log file for events...
I0321 00:27:13.469178 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"116f1dd6-1698-477d-b481-b04c529b67f3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:27:13.469210 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 00:27:14.455234 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:27:14.455249 543705 disk_worker.go:708] disk space is not compliant
W0321 00:27:14.455254 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:27:14.455868 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:27:14.455876 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:27:14.455881 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:27:14.456820 543705 disk_worker.go:494] system disk:vda1
I0321 00:27:14.456865 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:27:15.456821 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:27:15.456830 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:27:16.457941 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:27:16.457949 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:27:16.457993 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:27:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:27:16.472323 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:27:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:23.409787 543705 cpu.go:275] no items to output this cycle
I0321 00:27:23.409789 543705 memory.go:184] no items to output this cycle
I0321 00:27:26.841393 543705 disk_info.go:125] begin check local disk info of client
I0321 00:27:26.843920 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:27:26.843926 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056dcc0 0xc00056dd00]
E0321 00:27:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:33.409802 543705 memory.go:184] no items to output this cycle
I0321 00:27:33.409815 543705 cpu.go:275] no items to output this cycle
I0321 00:27:38.671361 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:27:38.671375 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:27:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:43.410591 543705 memory.go:191] Add success.
I0321 00:27:43.409816 543705 cpu.go:282] Add success.
I0321 00:27:43.420297 543705 net.go:648] Add success.
I0321 00:27:43.423027 543705 net.go:770] primary dev: ETH0
I0321 00:27:43.423046 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:27:43.423061 543705 net.go:698] Add success.
I0321 00:27:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:27:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:27:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:27:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:27:53.409797 543705 memory.go:184] no items to output this cycle
I0321 00:27:53.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:28:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:03.409765 543705 memory.go:184] no items to output this cycle
I0321 00:28:03.409799 543705 cpu.go:275] no items to output this cycle
E0321 00:28:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:13.409823 543705 memory.go:191] Add success.
I0321 00:28:13.409834 543705 cpu.go:282] Add success.
W0321 00:28:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:28:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:28:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:28:13.420252 543705 net.go:648] Add success.
I0321 00:28:13.423337 543705 net.go:770] primary dev: ETH0
I0321 00:28:13.423352 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:28:13.423368 543705 net.go:698] Add success.
I0321 00:28:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:28:14.455185 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:28:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0321 00:28:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:28:14.456594 543705 disk_worker.go:494] system disk:vda1
I0321 00:28:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:28:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:28:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:28:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:28:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:28:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:28:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:23.409784 543705 memory.go:184] no items to output this cycle
I0321 00:28:23.409794 543705 cpu.go:275] no items to output this cycle
I0321 00:28:26.845424 543705 disk_info.go:125] begin check local disk info of client
I0321 00:28:26.847995 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:28:26.848002 543705 disk_info.go:196] parse disk info done, disk is : [0xc000344700 0xc000344740]
E0321 00:28:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:33.409777 543705 memory.go:184] no items to output this cycle
I0321 00:28:33.409802 543705 cpu.go:275] no items to output this cycle
E0321 00:28:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:43.409820 543705 memory.go:191] Add success.
I0321 00:28:43.409824 543705 cpu.go:282] Add success.
I0321 00:28:43.420121 543705 net.go:648] Add success.
I0321 00:28:43.422853 543705 net.go:770] primary dev: ETH0
I0321 00:28:43.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:28:43.422878 543705 net.go:698] Add success.
I0321 00:28:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:28:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:28:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:28:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:28:53.409801 543705 memory.go:184] no items to output this cycle
I0321 00:28:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 00:29:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:03.409795 543705 cpu.go:275] no items to output this cycle
I0321 00:29:03.409807 543705 memory.go:184] no items to output this cycle
E0321 00:29:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:13.409803 543705 memory.go:191] Add success.
I0321 00:29:13.409809 543705 cpu.go:282] Add success.
W0321 00:29:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:29:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:29:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:29:13.420059 543705 net.go:648] Add success.
I0321 00:29:13.422641 543705 net.go:770] primary dev: ETH0
I0321 00:29:13.422657 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:29:13.422671 543705 net.go:698] Add success.
I0321 00:29:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:29:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:29:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0321 00:29:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:29:14.456508 543705 disk_worker.go:494] system disk:vda1
I0321 00:29:14.456555 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:29:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:29:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:29:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:29:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:29:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:29:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:23.409775 543705 memory.go:184] no items to output this cycle
I0321 00:29:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 00:29:26.848084 543705 disk_info.go:125] begin check local disk info of client
I0321 00:29:26.850854 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:29:26.850860 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039cc00 0xc00039cc40]
E0321 00:29:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:33.409785 543705 cpu.go:275] no items to output this cycle
I0321 00:29:33.409789 543705 memory.go:184] no items to output this cycle
E0321 00:29:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:43.409788 543705 memory.go:191] Add success.
I0321 00:29:43.409808 543705 cpu.go:282] Add success.
I0321 00:29:43.420002 543705 net.go:648] Add success.
I0321 00:29:43.423146 543705 net.go:770] primary dev: ETH0
I0321 00:29:43.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:29:43.423171 543705 net.go:698] Add success.
I0321 00:29:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:29:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:29:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:29:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:29:53.409774 543705 memory.go:184] no items to output this cycle
I0321 00:29:53.409782 543705 cpu.go:275] no items to output this cycle
E0321 00:30:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:03.409791 543705 memory.go:184] no items to output this cycle
I0321 00:30:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:30:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:13.409804 543705 memory.go:191] Add success.
I0321 00:30:13.409805 543705 cpu.go:282] Add success.
W0321 00:30:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:30:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:30:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:30:13.420126 543705 net.go:648] Add success.
I0321 00:30:13.422957 543705 net.go:770] primary dev: ETH0
I0321 00:30:13.422972 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:30:13.422986 543705 net.go:698] Add success.
I0321 00:30:13.468757 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bf539d49-c227-4f4b-bce9-8c91aa10ff50","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:30:13.468797 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:30:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:30:14.455175 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:30:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0321 00:30:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:30:14.456679 543705 disk_worker.go:494] system disk:vda1
I0321 00:30:14.456724 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:30:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:30:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:30:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:30:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:30:16.472374 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:30:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:23.409788 543705 memory.go:184] no items to output this cycle
I0321 00:30:23.409791 543705 cpu.go:275] no items to output this cycle
I0321 00:30:26.852416 543705 disk_info.go:125] begin check local disk info of client
I0321 00:30:26.855028 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:30:26.855034 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004994c0 0xc000499500]
E0321 00:30:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:33.409778 543705 memory.go:184] no items to output this cycle
I0321 00:30:33.409790 543705 cpu.go:275] no items to output this cycle
I0321 00:30:38.671520 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:30:38.671527 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:30:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:43.410633 543705 memory.go:191] Add success.
I0321 00:30:43.409800 543705 cpu.go:282] Add success.
I0321 00:30:43.419748 543705 net.go:648] Add success.
I0321 00:30:43.422440 543705 net.go:770] primary dev: ETH0
I0321 00:30:43.422455 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:30:43.422469 543705 net.go:698] Add success.
I0321 00:30:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:30:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:30:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:30:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:30:53.409774 543705 memory.go:184] no items to output this cycle
I0321 00:30:53.409780 543705 cpu.go:275] no items to output this cycle
E0321 00:31:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:03.409781 543705 memory.go:184] no items to output this cycle
I0321 00:31:03.409783 543705 cpu.go:275] no items to output this cycle
E0321 00:31:13.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:13.409843 543705 memory.go:191] Add success.
I0321 00:31:13.409853 543705 cpu.go:282] Add success.
W0321 00:31:13.409878 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:31:13.409895 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:31:13.409899 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:31:13.420232 543705 net.go:648] Add success.
I0321 00:31:13.422935 543705 net.go:770] primary dev: ETH0
I0321 00:31:13.422950 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:31:13.422964 543705 net.go:698] Add success.
I0321 00:31:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:31:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:31:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 00:31:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:31:14.456508 543705 disk_worker.go:494] system disk:vda1
I0321 00:31:14.456554 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:31:15.455975 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:31:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:31:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:31:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:31:16.472477 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:31:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:23.409809 543705 memory.go:184] no items to output this cycle
I0321 00:31:23.409816 543705 cpu.go:275] no items to output this cycle
I0321 00:31:26.856487 543705 disk_info.go:125] begin check local disk info of client
I0321 00:31:26.859089 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:31:26.859096 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357100 0xc000357140]
E0321 00:31:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:33.409770 543705 memory.go:184] no items to output this cycle
I0321 00:31:33.409804 543705 cpu.go:275] no items to output this cycle
E0321 00:31:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:43.409805 543705 memory.go:191] Add success.
I0321 00:31:43.409807 543705 cpu.go:282] Add success.
I0321 00:31:43.420338 543705 net.go:648] Add success.
I0321 00:31:43.422900 543705 net.go:770] primary dev: ETH0
I0321 00:31:43.422913 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:31:43.422924 543705 net.go:698] Add success.
I0321 00:31:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:31:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:31:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:31:53.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:31:53.409812 543705 memory.go:184] no items to output this cycle
I0321 00:31:53.409820 543705 cpu.go:275] no items to output this cycle
E0321 00:32:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:03.409798 543705 memory.go:184] no items to output this cycle
I0321 00:32:03.409810 543705 cpu.go:275] no items to output this cycle
E0321 00:32:13.409794 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:13.409831 543705 memory.go:191] Add success.
I0321 00:32:13.409835 543705 cpu.go:282] Add success.
W0321 00:32:13.409863 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:32:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:32:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:32:13.420126 543705 net.go:770] primary dev: ETH0
I0321 00:32:13.420140 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:32:13.420152 543705 net.go:698] Add success.
I0321 00:32:13.420382 543705 net.go:648] Add success.
W0321 00:32:14.455173 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:32:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0321 00:32:14.455186 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:32:14.455895 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:32:14.455903 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:32:14.455909 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:32:14.456563 543705 disk_worker.go:494] system disk:vda1
I0321 00:32:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:32:15.456797 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:32:15.456806 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:32:16.457911 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:32:16.457910 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:32:16.457964 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:32:16.457986 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:32:16.472316 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:32:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:23.409790 543705 memory.go:184] no items to output this cycle
I0321 00:32:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 00:32:26.859184 543705 disk_info.go:125] begin check local disk info of client
I0321 00:32:26.861612 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:32:26.861619 543705 disk_info.go:196] parse disk info done, disk is : [0xc000345580 0xc0003455c0]
E0321 00:32:33.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:33.409810 543705 memory.go:184] no items to output this cycle
I0321 00:32:33.409828 543705 cpu.go:275] no items to output this cycle
E0321 00:32:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:43.409789 543705 memory.go:191] Add success.
I0321 00:32:43.409818 543705 cpu.go:282] Add success.
I0321 00:32:43.420201 543705 net.go:648] Add success.
I0321 00:32:43.422881 543705 net.go:770] primary dev: ETH0
I0321 00:32:43.422894 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:32:43.422905 543705 net.go:698] Add success.
I0321 00:32:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:32:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:32:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:32:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:32:53.409761 543705 memory.go:184] no items to output this cycle
I0321 00:32:53.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:33:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:03.409796 543705 memory.go:184] no items to output this cycle
I0321 00:33:03.409825 543705 cpu.go:275] no items to output this cycle
E0321 00:33:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:13.409802 543705 memory.go:191] Add success.
I0321 00:33:13.409807 543705 cpu.go:282] Add success.
W0321 00:33:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:33:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:33:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:33:13.420130 543705 net.go:648] Add success.
I0321 00:33:13.422895 543705 net.go:770] primary dev: ETH0
I0321 00:33:13.422910 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:33:13.422923 543705 net.go:698] Add success.
I0321 00:33:13.470288 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"c77ae8b7-b689-44ed-b91f-46d17e23358e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:33:13.470323 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:33:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:33:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:33:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0321 00:33:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:33:14.456505 543705 disk_worker.go:494] system disk:vda1
I0321 00:33:14.456550 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:33:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:33:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:33:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:33:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:33:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:33:23.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:23.409811 543705 memory.go:184] no items to output this cycle
I0321 00:33:23.409819 543705 cpu.go:275] no items to output this cycle
I0321 00:33:26.861676 543705 disk_info.go:125] begin check local disk info of client
I0321 00:33:26.864125 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:33:26.864132 543705 disk_info.go:196] parse disk info done, disk is : [0xc000345000 0xc000345040]
E0321 00:33:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:33.409796 543705 memory.go:184] no items to output this cycle
I0321 00:33:33.409811 543705 cpu.go:275] no items to output this cycle
I0321 00:33:38.672056 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:33:38.672062 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:33:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:43.410558 543705 memory.go:191] Add success.
I0321 00:33:43.409810 543705 cpu.go:282] Add success.
I0321 00:33:43.420281 543705 net.go:648] Add success.
I0321 00:33:43.422853 543705 net.go:770] primary dev: ETH0
I0321 00:33:43.422876 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:33:43.422888 543705 net.go:698] Add success.
I0321 00:33:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:33:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:33:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:33:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:33:53.409786 543705 memory.go:184] no items to output this cycle
I0321 00:33:53.409800 543705 cpu.go:275] no items to output this cycle
E0321 00:34:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:03.409801 543705 memory.go:184] no items to output this cycle
I0321 00:34:03.409815 543705 cpu.go:275] no items to output this cycle
E0321 00:34:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:13.409799 543705 memory.go:191] Add success.
I0321 00:34:13.409801 543705 cpu.go:282] Add success.
W0321 00:34:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:34:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:34:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:34:13.420114 543705 net.go:648] Add success.
I0321 00:34:13.423219 543705 net.go:770] primary dev: ETH0
I0321 00:34:13.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:34:13.423248 543705 net.go:698] Add success.
I0321 00:34:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:34:14.455204 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:34:14.455215 543705 disk_worker.go:708] disk space is not compliant
W0321 00:34:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:34:14.456602 543705 disk_worker.go:494] system disk:vda1
I0321 00:34:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:34:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:34:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:34:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:34:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:34:16.472427 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:34:23.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:23.409815 543705 memory.go:184] no items to output this cycle
I0321 00:34:23.409825 543705 cpu.go:275] no items to output this cycle
I0321 00:34:26.865503 543705 disk_info.go:125] begin check local disk info of client
I0321 00:34:26.868021 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:34:26.868028 543705 disk_info.go:196] parse disk info done, disk is : [0xc000473e00 0xc000473e40]
E0321 00:34:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:33.409781 543705 memory.go:184] no items to output this cycle
I0321 00:34:33.409789 543705 cpu.go:275] no items to output this cycle
E0321 00:34:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:43.409792 543705 memory.go:191] Add success.
I0321 00:34:43.409803 543705 cpu.go:282] Add success.
I0321 00:34:43.419896 543705 net.go:648] Add success.
I0321 00:34:43.422528 543705 net.go:770] primary dev: ETH0
I0321 00:34:43.422541 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:34:43.422554 543705 net.go:698] Add success.
I0321 00:34:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:34:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:34:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:34:53.410505 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:34:53.410531 543705 memory.go:184] no items to output this cycle
I0321 00:34:53.410578 543705 cpu.go:275] no items to output this cycle
E0321 00:35:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:03.409782 543705 memory.go:184] no items to output this cycle
I0321 00:35:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 00:35:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:13.409799 543705 memory.go:191] Add success.
I0321 00:35:13.409803 543705 cpu.go:282] Add success.
W0321 00:35:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:35:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:35:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:35:13.420064 543705 net.go:648] Add success.
I0321 00:35:13.422730 543705 net.go:770] primary dev: ETH0
I0321 00:35:13.422744 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:35:13.422759 543705 net.go:698] Add success.
I0321 00:35:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:35:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:35:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0321 00:35:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:35:14.456593 543705 disk_worker.go:494] system disk:vda1
I0321 00:35:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:35:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:35:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:35:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:35:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:35:16.472397 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:35:23.410379 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:23.410396 543705 memory.go:184] no items to output this cycle
I0321 00:35:23.410405 543705 cpu.go:275] no items to output this cycle
I0321 00:35:26.869425 543705 disk_info.go:125] begin check local disk info of client
I0321 00:35:26.871927 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:35:26.871933 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b5980 0xc0002b59c0]
E0321 00:35:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:33.409798 543705 memory.go:184] no items to output this cycle
I0321 00:35:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 00:35:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:43.409815 543705 memory.go:191] Add success.
I0321 00:35:43.409822 543705 cpu.go:282] Add success.
I0321 00:35:43.419892 543705 net.go:648] Add success.
I0321 00:35:43.422635 543705 net.go:770] primary dev: ETH0
I0321 00:35:43.422649 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:35:43.422661 543705 net.go:698] Add success.
I0321 00:35:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:35:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:35:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:35:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:35:53.409854 543705 memory.go:184] no items to output this cycle
I0321 00:35:53.409927 543705 cpu.go:275] no items to output this cycle
E0321 00:36:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:03.409796 543705 memory.go:184] no items to output this cycle
I0321 00:36:03.409808 543705 cpu.go:275] no items to output this cycle
E0321 00:36:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:13.409820 543705 memory.go:191] Add success.
I0321 00:36:13.409826 543705 cpu.go:282] Add success.
W0321 00:36:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:36:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:36:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:36:13.420204 543705 net.go:648] Add success.
I0321 00:36:13.422835 543705 net.go:770] primary dev: ETH0
I0321 00:36:13.422848 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:36:13.422861 543705 net.go:698] Add success.
I0321 00:36:13.468277 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a07847d2-bdbc-4abd-b231-731487ab37fe","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:36:13.468311 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:36:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:36:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:36:14.455225 543705 disk_worker.go:708] disk space is not compliant
W0321 00:36:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:36:14.456639 543705 disk_worker.go:494] system disk:vda1
I0321 00:36:14.456670 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:36:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:36:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:36:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:36:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:36:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:36:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:23.409775 543705 memory.go:184] no items to output this cycle
I0321 00:36:23.409807 543705 cpu.go:275] no items to output this cycle
I0321 00:36:26.873612 543705 disk_info.go:125] begin check local disk info of client
I0321 00:36:26.876246 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:36:26.876253 543705 disk_info.go:196] parse disk info done, disk is : [0xc000342bc0 0xc000342c00]
E0321 00:36:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:33.409796 543705 memory.go:184] no items to output this cycle
I0321 00:36:33.409815 543705 cpu.go:275] no items to output this cycle
I0321 00:36:38.672198 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:36:38.672205 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:36:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:43.410478 543705 memory.go:191] Add success.
I0321 00:36:43.409800 543705 cpu.go:282] Add success.
I0321 00:36:43.420157 543705 net.go:648] Add success.
I0321 00:36:43.422662 543705 net.go:770] primary dev: ETH0
I0321 00:36:43.422675 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:36:43.422688 543705 net.go:698] Add success.
I0321 00:36:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:36:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:36:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:36:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:36:53.409872 543705 cpu.go:275] no items to output this cycle
I0321 00:36:53.409882 543705 memory.go:184] no items to output this cycle
E0321 00:37:03.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:03.409791 543705 memory.go:184] no items to output this cycle
I0321 00:37:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:37:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:13.409800 543705 memory.go:191] Add success.
I0321 00:37:13.409801 543705 cpu.go:282] Add success.
W0321 00:37:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:37:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:37:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:37:13.420138 543705 net.go:648] Add success.
I0321 00:37:13.423175 543705 net.go:770] primary dev: ETH0
I0321 00:37:13.423188 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:37:13.423201 543705 net.go:698] Add success.
I0321 00:37:13.452787 543705 event_worker.go:152] Polling the log file for events...
W0321 00:37:14.455104 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:37:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0321 00:37:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:37:14.456883 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:37:14.456893 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:37:14.456899 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:37:14.456973 543705 disk_worker.go:494] system disk:vda1
I0321 00:37:14.457016 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:37:15.456824 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:37:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:37:16.457920 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:37:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:37:16.457972 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:37:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:37:16.472313 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:37:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:37:23.409783 543705 cpu.go:275] no items to output this cycle
I0321 00:37:26.877583 543705 disk_info.go:125] begin check local disk info of client
I0321 00:37:26.880120 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:37:26.880126 543705 disk_info.go:196] parse disk info done, disk is : [0xc000343140 0xc000343180]
E0321 00:37:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:33.409771 543705 memory.go:184] no items to output this cycle
I0321 00:37:33.409792 543705 cpu.go:275] no items to output this cycle
E0321 00:37:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:43.409806 543705 memory.go:191] Add success.
I0321 00:37:43.409815 543705 cpu.go:282] Add success.
I0321 00:37:43.420007 543705 net.go:648] Add success.
I0321 00:37:43.422550 543705 net.go:770] primary dev: ETH0
I0321 00:37:43.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:37:43.422576 543705 net.go:698] Add success.
I0321 00:37:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:37:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:37:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:37:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:37:53.409775 543705 cpu.go:275] no items to output this cycle
I0321 00:37:53.409777 543705 memory.go:184] no items to output this cycle
E0321 00:38:03.409884 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:03.409903 543705 memory.go:184] no items to output this cycle
I0321 00:38:03.409958 543705 cpu.go:275] no items to output this cycle
E0321 00:38:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:13.409793 543705 memory.go:191] Add success.
I0321 00:38:13.409817 543705 cpu.go:282] Add success.
W0321 00:38:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:38:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:38:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:38:13.420169 543705 net.go:648] Add success.
I0321 00:38:13.423159 543705 net.go:770] primary dev: ETH0
I0321 00:38:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:38:13.423188 543705 net.go:698] Add success.
I0321 00:38:14.454952 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:38:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:38:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0321 00:38:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:38:14.456589 543705 disk_worker.go:494] system disk:vda1
I0321 00:38:14.456620 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:38:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:38:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:38:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:38:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:38:16.472360 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:38:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:23.409773 543705 memory.go:184] no items to output this cycle
I0321 00:38:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 00:38:26.881656 543705 disk_info.go:125] begin check local disk info of client
I0321 00:38:26.884447 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:38:26.884454 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bec0 0xc00007bf00]
E0321 00:38:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:33.409764 543705 memory.go:184] no items to output this cycle
I0321 00:38:33.409784 543705 cpu.go:275] no items to output this cycle
E0321 00:38:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:43.409780 543705 memory.go:191] Add success.
I0321 00:38:43.409792 543705 cpu.go:282] Add success.
I0321 00:38:43.419855 543705 net.go:648] Add success.
I0321 00:38:43.421004 543705 net.go:770] primary dev: ETH0
I0321 00:38:43.421017 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:38:43.421029 543705 net.go:698] Add success.
I0321 00:38:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:38:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:38:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:38:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:38:53.409799 543705 memory.go:184] no items to output this cycle
I0321 00:38:53.409809 543705 cpu.go:275] no items to output this cycle
E0321 00:39:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:03.409779 543705 memory.go:184] no items to output this cycle
I0321 00:39:03.409781 543705 cpu.go:275] no items to output this cycle
E0321 00:39:13.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:13.409804 543705 memory.go:191] Add success.
I0321 00:39:13.409806 543705 cpu.go:282] Add success.
W0321 00:39:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:39:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:39:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:39:13.420185 543705 net.go:648] Add success.
I0321 00:39:13.423250 543705 net.go:770] primary dev: ETH0
I0321 00:39:13.423266 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:39:13.423280 543705 net.go:698] Add success.
I0321 00:39:13.556113 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"bf3fcf07-f1fb-4582-8f14-1cd6c8c7754f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:39:13.556146 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:39:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:39:14.455140 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:39:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0321 00:39:14.455217 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:39:14.456600 543705 disk_worker.go:494] system disk:vda1
I0321 00:39:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:39:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:39:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:39:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:39:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:39:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:39:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:23.409780 543705 memory.go:184] no items to output this cycle
I0321 00:39:23.409792 543705 cpu.go:275] no items to output this cycle
I0321 00:39:26.885626 543705 disk_info.go:125] begin check local disk info of client
I0321 00:39:26.888190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:39:26.888196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4640 0xc0000c4680]
E0321 00:39:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:33.409792 543705 memory.go:184] no items to output this cycle
I0321 00:39:33.409805 543705 cpu.go:275] no items to output this cycle
I0321 00:39:38.673064 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:39:38.673071 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:39:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:43.410632 543705 memory.go:191] Add success.
I0321 00:39:43.409819 543705 cpu.go:282] Add success.
I0321 00:39:43.420311 543705 net.go:648] Add success.
I0321 00:39:43.423157 543705 net.go:770] primary dev: ETH0
I0321 00:39:43.423170 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:39:43.423183 543705 net.go:698] Add success.
I0321 00:39:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:39:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:39:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:39:53.410250 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:39:53.410270 543705 memory.go:184] no items to output this cycle
I0321 00:39:53.410282 543705 cpu.go:275] no items to output this cycle
E0321 00:40:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:03.409783 543705 memory.go:184] no items to output this cycle
I0321 00:40:03.409784 543705 cpu.go:275] no items to output this cycle
E0321 00:40:13.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:13.409799 543705 memory.go:191] Add success.
I0321 00:40:13.409815 543705 cpu.go:282] Add success.
W0321 00:40:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:40:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:40:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:40:13.420193 543705 net.go:648] Add success.
I0321 00:40:13.422940 543705 net.go:770] primary dev: ETH0
I0321 00:40:13.422954 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:40:13.422966 543705 net.go:698] Add success.
I0321 00:40:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:40:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:40:14.455226 543705 disk_worker.go:708] disk space is not compliant
W0321 00:40:14.455228 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:40:14.456605 543705 disk_worker.go:494] system disk:vda1
I0321 00:40:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:40:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:40:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:40:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:40:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:40:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:40:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:23.409790 543705 memory.go:184] no items to output this cycle
I0321 00:40:23.409794 543705 cpu.go:275] no items to output this cycle
I0321 00:40:26.889674 543705 disk_info.go:125] begin check local disk info of client
I0321 00:40:26.892255 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:40:26.892262 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5040 0xc0000c5080]
E0321 00:40:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:33.409794 543705 memory.go:184] no items to output this cycle
I0321 00:40:33.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:40:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:43.409811 543705 memory.go:191] Add success.
I0321 00:40:43.409820 543705 cpu.go:282] Add success.
I0321 00:40:43.420041 543705 net.go:648] Add success.
I0321 00:40:43.422699 543705 net.go:770] primary dev: ETH0
I0321 00:40:43.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:40:43.422725 543705 net.go:698] Add success.
I0321 00:40:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:40:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:40:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:40:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:40:53.409772 543705 memory.go:184] no items to output this cycle
I0321 00:40:53.409780 543705 cpu.go:275] no items to output this cycle
E0321 00:41:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:03.409780 543705 memory.go:184] no items to output this cycle
I0321 00:41:03.409783 543705 cpu.go:275] no items to output this cycle
E0321 00:41:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:13.409821 543705 memory.go:191] Add success.
I0321 00:41:13.409827 543705 cpu.go:282] Add success.
W0321 00:41:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:41:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:41:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:41:13.419712 543705 net.go:648] Add success.
I0321 00:41:13.422346 543705 net.go:770] primary dev: ETH0
I0321 00:41:13.422359 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:41:13.422371 543705 net.go:698] Add success.
I0321 00:41:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:41:14.455130 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:41:14.455213 543705 disk_worker.go:708] disk space is not compliant
W0321 00:41:14.455216 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:41:14.456600 543705 disk_worker.go:494] system disk:vda1
I0321 00:41:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:41:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:41:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:41:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:41:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:41:16.472413 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:41:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:23.409785 543705 memory.go:184] no items to output this cycle
I0321 00:41:23.409788 543705 cpu.go:275] no items to output this cycle
I0321 00:41:26.893674 543705 disk_info.go:125] begin check local disk info of client
I0321 00:41:26.896193 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:41:26.896199 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c41c0 0xc0000c4200]
E0321 00:41:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:33.409764 543705 memory.go:184] no items to output this cycle
I0321 00:41:33.409801 543705 cpu.go:275] no items to output this cycle
E0321 00:41:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:43.409786 543705 memory.go:191] Add success.
I0321 00:41:43.409791 543705 cpu.go:282] Add success.
I0321 00:41:43.419829 543705 net.go:770] primary dev: ETH0
I0321 00:41:43.419842 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:41:43.419855 543705 net.go:698] Add success.
I0321 00:41:43.420184 543705 net.go:648] Add success.
I0321 00:41:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:41:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:41:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:41:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:41:53.409779 543705 memory.go:184] no items to output this cycle
I0321 00:41:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 00:42:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:03.409762 543705 memory.go:184] no items to output this cycle
I0321 00:42:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 00:42:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:13.409800 543705 memory.go:191] Add success.
I0321 00:42:13.409806 543705 cpu.go:282] Add success.
W0321 00:42:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:42:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:42:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:42:13.420340 543705 net.go:648] Add success.
I0321 00:42:13.423105 543705 net.go:770] primary dev: ETH0
I0321 00:42:13.423120 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:42:13.423133 543705 net.go:698] Add success.
I0321 00:42:13.469922 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"5f547862-e453-4584-929e-eca29fb80bda","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:42:13.469952 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 00:42:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:42:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0321 00:42:14.455178 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:42:14.455914 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:42:14.455922 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:42:14.455927 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:42:14.456560 543705 disk_worker.go:494] system disk:vda1
I0321 00:42:14.456590 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:42:15.456826 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:42:15.456834 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:42:16.457907 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:42:16.457906 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:42:16.457964 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:42:16.457984 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:42:16.472354 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:42:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:23.409807 543705 memory.go:184] no items to output this cycle
I0321 00:42:23.409817 543705 cpu.go:275] no items to output this cycle
I0321 00:42:26.897673 543705 disk_info.go:125] begin check local disk info of client
I0321 00:42:26.900197 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:42:26.900204 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e6ec0 0xc0003e6f00]
E0321 00:42:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:33.409766 543705 memory.go:184] no items to output this cycle
I0321 00:42:33.409796 543705 cpu.go:275] no items to output this cycle
I0321 00:42:38.673744 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:42:38.673750 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:42:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:43.410772 543705 memory.go:191] Add success.
I0321 00:42:43.409793 543705 cpu.go:282] Add success.
I0321 00:42:43.420473 543705 net.go:648] Add success.
I0321 00:42:43.423757 543705 net.go:770] primary dev: ETH0
I0321 00:42:43.423771 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:42:43.423785 543705 net.go:698] Add success.
I0321 00:42:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:42:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:42:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:42:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:42:53.409782 543705 memory.go:184] no items to output this cycle
I0321 00:42:53.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:43:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:03.409801 543705 memory.go:184] no items to output this cycle
I0321 00:43:03.409813 543705 cpu.go:275] no items to output this cycle
W0321 00:43:13.409707 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:43:13.409724 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:43:13.409728 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
E0321 00:43:13.409800 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:13.409819 543705 memory.go:191] Add success.
I0321 00:43:13.409829 543705 cpu.go:282] Add success.
I0321 00:43:13.419713 543705 net.go:648] Add success.
I0321 00:43:13.422364 543705 net.go:770] primary dev: ETH0
I0321 00:43:13.422378 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:43:13.422392 543705 net.go:698] Add success.
I0321 00:43:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:43:14.455125 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:43:14.455203 543705 disk_worker.go:708] disk space is not compliant
W0321 00:43:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:43:14.456609 543705 disk_worker.go:494] system disk:vda1
I0321 00:43:14.456638 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:43:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:43:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:43:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:43:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:43:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:23.409802 543705 memory.go:184] no items to output this cycle
I0321 00:43:23.409816 543705 cpu.go:275] no items to output this cycle
I0321 00:43:26.901672 543705 disk_info.go:125] begin check local disk info of client
I0321 00:43:26.904442 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:43:26.904449 543705 disk_info.go:196] parse disk info done, disk is : [0xc000377b00 0xc000377b40]
E0321 00:43:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:33.409771 543705 memory.go:184] no items to output this cycle
I0321 00:43:33.409793 543705 cpu.go:275] no items to output this cycle
E0321 00:43:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:43.409806 543705 memory.go:191] Add success.
I0321 00:43:43.409816 543705 cpu.go:282] Add success.
I0321 00:43:43.419851 543705 net.go:648] Add success.
I0321 00:43:43.422531 543705 net.go:770] primary dev: ETH0
I0321 00:43:43.422545 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:43:43.422557 543705 net.go:698] Add success.
I0321 00:43:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:43:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:43:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:43:53.410361 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:43:53.410382 543705 memory.go:184] no items to output this cycle
I0321 00:43:53.410391 543705 cpu.go:275] no items to output this cycle
E0321 00:44:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:03.409782 543705 memory.go:184] no items to output this cycle
I0321 00:44:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:44:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:13.409795 543705 memory.go:191] Add success.
I0321 00:44:13.409796 543705 cpu.go:282] Add success.
W0321 00:44:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:44:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:44:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:44:13.420320 543705 net.go:648] Add success.
I0321 00:44:13.422918 543705 net.go:770] primary dev: ETH0
I0321 00:44:13.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:44:13.422943 543705 net.go:698] Add success.
I0321 00:44:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:44:14.455095 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:44:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0321 00:44:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:44:14.456591 543705 disk_worker.go:494] system disk:vda1
I0321 00:44:14.456631 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:44:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:44:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:44:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:44:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:44:16.472438 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:44:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:23.409781 543705 memory.go:184] no items to output this cycle
I0321 00:44:23.409794 543705 cpu.go:275] no items to output this cycle
I0321 00:44:26.905673 543705 disk_info.go:125] begin check local disk info of client
I0321 00:44:26.908207 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:44:26.908213 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056c480 0xc00056c4c0]
E0321 00:44:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:33.409797 543705 memory.go:184] no items to output this cycle
I0321 00:44:33.409810 543705 cpu.go:275] no items to output this cycle
E0321 00:44:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:43.409779 543705 memory.go:191] Add success.
I0321 00:44:43.409801 543705 cpu.go:282] Add success.
I0321 00:44:43.419869 543705 net.go:648] Add success.
I0321 00:44:43.422311 543705 net.go:770] primary dev: ETH0
I0321 00:44:43.422330 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:44:43.422346 543705 net.go:698] Add success.
I0321 00:44:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:44:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:44:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:44:53.410334 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:44:53.410343 543705 cpu.go:275] no items to output this cycle
I0321 00:44:53.410347 543705 memory.go:184] no items to output this cycle
E0321 00:45:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:03.409772 543705 memory.go:184] no items to output this cycle
I0321 00:45:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 00:45:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:13.409795 543705 memory.go:191] Add success.
I0321 00:45:13.409810 543705 cpu.go:282] Add success.
W0321 00:45:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:45:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:45:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:45:13.420056 543705 net.go:648] Add success.
I0321 00:45:13.422519 543705 net.go:770] primary dev: ETH0
I0321 00:45:13.422533 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:45:13.422544 543705 net.go:698] Add success.
I0321 00:45:13.463628 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1d7f611a-d127-418b-b150-32546d22fc61","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:45:13.463665 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:45:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:45:14.455364 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:45:14.455382 543705 disk_worker.go:708] disk space is not compliant
W0321 00:45:14.455386 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:45:14.457040 543705 disk_worker.go:494] system disk:vda1
I0321 00:45:14.457070 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:45:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:45:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:45:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:45:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:45:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:45:23.410281 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:23.410300 543705 memory.go:184] no items to output this cycle
I0321 00:45:23.410303 543705 cpu.go:275] no items to output this cycle
I0321 00:45:26.909678 543705 disk_info.go:125] begin check local disk info of client
I0321 00:45:26.912404 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:45:26.912411 543705 disk_info.go:196] parse disk info done, disk is : [0xc0005492c0 0xc000549300]
E0321 00:45:33.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:33.409761 543705 memory.go:184] no items to output this cycle
I0321 00:45:33.409791 543705 cpu.go:275] no items to output this cycle
I0321 00:45:38.675064 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:45:38.675071 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:45:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:43.410619 543705 memory.go:191] Add success.
I0321 00:45:43.409814 543705 cpu.go:282] Add success.
I0321 00:45:43.420328 543705 net.go:648] Add success.
I0321 00:45:43.422938 543705 net.go:770] primary dev: ETH0
I0321 00:45:43.422953 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:45:43.422966 543705 net.go:698] Add success.
I0321 00:45:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:45:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:45:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:45:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:45:53.409790 543705 memory.go:184] no items to output this cycle
I0321 00:45:53.409791 543705 cpu.go:275] no items to output this cycle
E0321 00:46:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:03.409781 543705 memory.go:184] no items to output this cycle
I0321 00:46:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:46:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:13.409821 543705 memory.go:191] Add success.
I0321 00:46:13.409831 543705 cpu.go:282] Add success.
W0321 00:46:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:46:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:46:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:46:13.420268 543705 net.go:648] Add success.
I0321 00:46:13.422769 543705 net.go:770] primary dev: ETH0
I0321 00:46:13.422785 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:46:13.422799 543705 net.go:698] Add success.
I0321 00:46:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:46:14.455435 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:46:14.455450 543705 disk_worker.go:708] disk space is not compliant
W0321 00:46:14.455459 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:46:14.457078 543705 disk_worker.go:494] system disk:vda1
I0321 00:46:14.457106 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:46:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:46:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:46:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:46:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:46:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:46:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:23.409780 543705 memory.go:184] no items to output this cycle
I0321 00:46:23.409801 543705 cpu.go:275] no items to output this cycle
I0321 00:46:26.913677 543705 disk_info.go:125] begin check local disk info of client
I0321 00:46:26.916202 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:46:26.916208 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007bb00 0xc00007bb40]
E0321 00:46:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:33.409790 543705 memory.go:184] no items to output this cycle
I0321 00:46:33.409803 543705 cpu.go:275] no items to output this cycle
E0321 00:46:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:43.409774 543705 memory.go:191] Add success.
I0321 00:46:43.409799 543705 cpu.go:282] Add success.
I0321 00:46:43.419854 543705 net.go:648] Add success.
I0321 00:46:43.422527 543705 net.go:770] primary dev: ETH0
I0321 00:46:43.422541 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:46:43.422564 543705 net.go:698] Add success.
I0321 00:46:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:46:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:46:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:46:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:46:53.409765 543705 memory.go:184] no items to output this cycle
I0321 00:46:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 00:47:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:03.409780 543705 memory.go:184] no items to output this cycle
I0321 00:47:03.409779 543705 cpu.go:275] no items to output this cycle
E0321 00:47:13.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:13.409821 543705 memory.go:191] Add success.
I0321 00:47:13.409828 543705 cpu.go:282] Add success.
W0321 00:47:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:47:13.409870 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:47:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:47:13.420208 543705 net.go:648] Add success.
I0321 00:47:13.423260 543705 net.go:770] primary dev: ETH0
I0321 00:47:13.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:47:13.423290 543705 net.go:698] Add success.
I0321 00:47:13.452804 543705 event_worker.go:152] Polling the log file for events...
W0321 00:47:14.455307 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:47:14.455451 543705 disk_worker.go:708] disk space is not compliant
W0321 00:47:14.455456 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:47:14.457493 543705 disk_worker.go:494] system disk:vda1
E0321 00:47:14.457567 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:47:14.457577 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:47:14.457584 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:47:14.457624 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:47:15.456827 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:47:15.456835 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:47:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:47:16.457975 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:47:16.458017 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:47:16.458033 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:47:16.472351 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:47:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:23.409783 543705 cpu.go:275] no items to output this cycle
I0321 00:47:23.409790 543705 memory.go:184] no items to output this cycle
I0321 00:47:26.917674 543705 disk_info.go:125] begin check local disk info of client
I0321 00:47:26.920168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:47:26.920174 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e69c0 0xc0003e6a00]
E0321 00:47:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:33.409765 543705 memory.go:184] no items to output this cycle
I0321 00:47:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 00:47:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:43.409790 543705 memory.go:191] Add success.
I0321 00:47:43.409792 543705 cpu.go:282] Add success.
I0321 00:47:43.419902 543705 net.go:648] Add success.
I0321 00:47:43.422616 543705 net.go:770] primary dev: ETH0
I0321 00:47:43.422640 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:47:43.422652 543705 net.go:698] Add success.
I0321 00:47:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:47:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:47:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:47:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:47:53.409792 543705 memory.go:184] no items to output this cycle
I0321 00:47:53.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:48:03.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:03.409786 543705 memory.go:184] no items to output this cycle
I0321 00:48:03.409787 543705 cpu.go:275] no items to output this cycle
E0321 00:48:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:13.409833 543705 memory.go:191] Add success.
I0321 00:48:13.409838 543705 cpu.go:282] Add success.
W0321 00:48:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:48:13.409882 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:48:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:48:13.420126 543705 net.go:648] Add success.
I0321 00:48:13.423267 543705 net.go:770] primary dev: ETH0
I0321 00:48:13.423279 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:48:13.423292 543705 net.go:698] Add success.
I0321 00:48:13.474485 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f5fb8e5b-a436-4723-af1c-93941a311952","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:48:13.474520 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:48:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:48:14.455392 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:48:14.455406 543705 disk_worker.go:708] disk space is not compliant
W0321 00:48:14.455554 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:48:14.457035 543705 disk_worker.go:494] system disk:vda1
I0321 00:48:14.457063 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:48:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:48:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:48:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:48:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:48:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:48:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:23.409777 543705 memory.go:184] no items to output this cycle
I0321 00:48:23.409809 543705 cpu.go:275] no items to output this cycle
I0321 00:48:26.921672 543705 disk_info.go:125] begin check local disk info of client
I0321 00:48:26.924234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:48:26.924242 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab7c0 0xc0001ab800]
E0321 00:48:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:33.409791 543705 memory.go:184] no items to output this cycle
I0321 00:48:33.409805 543705 cpu.go:275] no items to output this cycle
I0321 00:48:38.675211 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:48:38.675217 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:48:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:43.410668 543705 memory.go:191] Add success.
I0321 00:48:43.409799 543705 cpu.go:282] Add success.
I0321 00:48:43.420383 543705 net.go:648] Add success.
I0321 00:48:43.423339 543705 net.go:770] primary dev: ETH0
I0321 00:48:43.423351 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:48:43.423364 543705 net.go:698] Add success.
I0321 00:48:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:48:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:48:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:48:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:48:53.409767 543705 memory.go:184] no items to output this cycle
I0321 00:48:53.409795 543705 cpu.go:275] no items to output this cycle
E0321 00:49:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:03.409765 543705 memory.go:184] no items to output this cycle
I0321 00:49:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 00:49:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:13.409791 543705 memory.go:191] Add success.
I0321 00:49:13.409797 543705 cpu.go:282] Add success.
W0321 00:49:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:49:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:49:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:49:13.420045 543705 net.go:648] Add success.
I0321 00:49:13.422855 543705 net.go:770] primary dev: ETH0
I0321 00:49:13.422869 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:49:13.422881 543705 net.go:698] Add success.
I0321 00:49:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:49:14.455370 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:49:14.455493 543705 disk_worker.go:708] disk space is not compliant
W0321 00:49:14.455498 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:49:14.457178 543705 disk_worker.go:494] system disk:vda1
I0321 00:49:14.457207 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:49:15.455947 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:49:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:49:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:49:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:49:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:49:23.410365 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:23.410374 543705 cpu.go:275] no items to output this cycle
I0321 00:49:23.410382 543705 memory.go:184] no items to output this cycle
I0321 00:49:26.925671 543705 disk_info.go:125] begin check local disk info of client
I0321 00:49:26.928136 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:49:26.928142 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b1880 0xc0003b18c0]
E0321 00:49:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:33.409773 543705 memory.go:184] no items to output this cycle
I0321 00:49:33.409776 543705 cpu.go:275] no items to output this cycle
E0321 00:49:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:43.409790 543705 memory.go:191] Add success.
I0321 00:49:43.409792 543705 cpu.go:282] Add success.
I0321 00:49:43.419826 543705 net.go:648] Add success.
I0321 00:49:43.422865 543705 net.go:770] primary dev: ETH0
I0321 00:49:43.422878 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:49:43.422891 543705 net.go:698] Add success.
I0321 00:49:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:49:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:49:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:49:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:49:53.409768 543705 memory.go:184] no items to output this cycle
I0321 00:49:53.409779 543705 cpu.go:275] no items to output this cycle
E0321 00:50:03.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:03.409761 543705 memory.go:184] no items to output this cycle
I0321 00:50:03.409794 543705 cpu.go:275] no items to output this cycle
E0321 00:50:13.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:13.409799 543705 memory.go:191] Add success.
I0321 00:50:13.409803 543705 cpu.go:282] Add success.
W0321 00:50:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:50:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:50:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:50:13.420049 543705 net.go:648] Add success.
I0321 00:50:13.422806 543705 net.go:770] primary dev: ETH0
I0321 00:50:13.422824 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:50:13.422841 543705 net.go:698] Add success.
I0321 00:50:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:50:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:50:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0321 00:50:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:50:14.456535 543705 disk_worker.go:494] system disk:vda1
I0321 00:50:14.456575 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:50:15.456015 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:50:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:50:16.458031 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:50:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:50:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:50:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:23.409813 543705 memory.go:184] no items to output this cycle
I0321 00:50:23.409825 543705 cpu.go:275] no items to output this cycle
I0321 00:50:26.929678 543705 disk_info.go:125] begin check local disk info of client
I0321 00:50:26.932252 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:50:26.932259 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c4180 0xc0000c41c0]
E0321 00:50:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:33.409794 543705 memory.go:184] no items to output this cycle
I0321 00:50:33.409808 543705 cpu.go:275] no items to output this cycle
E0321 00:50:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:43.409789 543705 memory.go:191] Add success.
I0321 00:50:43.409789 543705 cpu.go:282] Add success.
I0321 00:50:43.420034 543705 net.go:648] Add success.
I0321 00:50:43.422797 543705 net.go:770] primary dev: ETH0
I0321 00:50:43.422812 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:50:43.422827 543705 net.go:698] Add success.
I0321 00:50:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:50:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:50:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:50:53.410342 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:50:53.410357 543705 memory.go:184] no items to output this cycle
I0321 00:50:53.410383 543705 cpu.go:275] no items to output this cycle
E0321 00:51:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:03.409775 543705 memory.go:184] no items to output this cycle
I0321 00:51:03.409781 543705 cpu.go:275] no items to output this cycle
E0321 00:51:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:13.409819 543705 memory.go:191] Add success.
I0321 00:51:13.409823 543705 cpu.go:282] Add success.
W0321 00:51:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:51:13.409868 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:51:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:51:13.420170 543705 net.go:648] Add success.
I0321 00:51:13.422836 543705 net.go:770] primary dev: ETH0
I0321 00:51:13.422851 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:51:13.422863 543705 net.go:698] Add success.
I0321 00:51:13.468956 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8db016b3-607d-4761-8021-c9796ff5c5bf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:51:13.468988 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:51:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:51:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:51:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 00:51:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:51:14.456963 543705 disk_worker.go:494] system disk:vda1
I0321 00:51:14.456994 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:51:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:51:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:51:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:51:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:51:16.472430 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:51:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:23.409783 543705 memory.go:184] no items to output this cycle
I0321 00:51:23.409785 543705 cpu.go:275] no items to output this cycle
I0321 00:51:26.933672 543705 disk_info.go:125] begin check local disk info of client
I0321 00:51:26.936124 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:51:26.936130 543705 disk_info.go:196] parse disk info done, disk is : [0xc000484940 0xc000484980]
E0321 00:51:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:33.409765 543705 memory.go:184] no items to output this cycle
I0321 00:51:33.409810 543705 cpu.go:275] no items to output this cycle
I0321 00:51:38.676084 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:51:38.676090 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:51:43.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:43.410756 543705 memory.go:191] Add success.
I0321 00:51:43.409847 543705 cpu.go:282] Add success.
I0321 00:51:43.420543 543705 net.go:648] Add success.
I0321 00:51:43.423280 543705 net.go:770] primary dev: ETH0
I0321 00:51:43.423300 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:51:43.423315 543705 net.go:698] Add success.
I0321 00:51:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:51:46.458067 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:51:46.458096 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:51:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:51:53.409764 543705 memory.go:184] no items to output this cycle
I0321 00:51:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 00:52:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:03.409793 543705 memory.go:184] no items to output this cycle
I0321 00:52:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 00:52:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:13.409787 543705 memory.go:191] Add success.
I0321 00:52:13.409808 543705 cpu.go:282] Add success.
W0321 00:52:13.409815 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:52:13.409827 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:52:13.409830 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:52:13.420135 543705 net.go:648] Add success.
I0321 00:52:13.422736 543705 net.go:770] primary dev: ETH0
I0321 00:52:13.422751 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:52:13.422763 543705 net.go:698] Add success.
W0321 00:52:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:52:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0321 00:52:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:52:14.456811 543705 disk_worker.go:494] system disk:vda1
I0321 00:52:14.456853 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:52:14.457663 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:52:14.457671 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:52:14.457676 543705 custom_config.go:64] query custom config with name: gpu
E0321 00:52:15.456792 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:52:15.456801 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:52:16.457945 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:52:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:52:16.457999 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:52:16.458018 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:52:16.472331 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:52:23.410378 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:23.410396 543705 memory.go:184] no items to output this cycle
I0321 00:52:23.410400 543705 cpu.go:275] no items to output this cycle
I0321 00:52:26.937685 543705 disk_info.go:125] begin check local disk info of client
I0321 00:52:26.940210 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:52:26.940216 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002afb80 0xc0002afbc0]
E0321 00:52:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:33.409802 543705 memory.go:184] no items to output this cycle
I0321 00:52:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 00:52:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:43.409801 543705 memory.go:191] Add success.
I0321 00:52:43.409802 543705 cpu.go:282] Add success.
I0321 00:52:43.419949 543705 net.go:648] Add success.
I0321 00:52:43.422850 543705 net.go:770] primary dev: ETH0
I0321 00:52:43.422866 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:52:43.422879 543705 net.go:698] Add success.
I0321 00:52:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:52:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:52:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:52:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:52:53.409777 543705 memory.go:184] no items to output this cycle
I0321 00:52:53.409789 543705 cpu.go:275] no items to output this cycle
E0321 00:53:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:03.409803 543705 memory.go:184] no items to output this cycle
I0321 00:53:03.409817 543705 cpu.go:275] no items to output this cycle
E0321 00:53:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:13.409806 543705 memory.go:191] Add success.
I0321 00:53:13.409807 543705 cpu.go:282] Add success.
W0321 00:53:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:53:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:53:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:53:13.420135 543705 net.go:648] Add success.
I0321 00:53:13.422764 543705 net.go:770] primary dev: ETH0
I0321 00:53:13.422778 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:53:13.422791 543705 net.go:698] Add success.
I0321 00:53:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:53:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:53:14.455194 543705 disk_worker.go:708] disk space is not compliant
W0321 00:53:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:53:14.456600 543705 disk_worker.go:494] system disk:vda1
I0321 00:53:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:53:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:53:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:53:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:53:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:53:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:53:23.409856 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:23.409878 543705 memory.go:184] no items to output this cycle
I0321 00:53:23.409948 543705 cpu.go:275] no items to output this cycle
I0321 00:53:26.941671 543705 disk_info.go:125] begin check local disk info of client
I0321 00:53:26.944161 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:53:26.944167 543705 disk_info.go:196] parse disk info done, disk is : [0xc000321c00 0xc000321c40]
E0321 00:53:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:33.409774 543705 memory.go:184] no items to output this cycle
I0321 00:53:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 00:53:43.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:43.409795 543705 memory.go:191] Add success.
I0321 00:53:43.409816 543705 cpu.go:282] Add success.
I0321 00:53:43.420034 543705 net.go:648] Add success.
I0321 00:53:43.422576 543705 net.go:770] primary dev: ETH0
I0321 00:53:43.422590 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:53:43.422603 543705 net.go:698] Add success.
I0321 00:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:53:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:53:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:53:53.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:53:53.409796 543705 memory.go:184] no items to output this cycle
I0321 00:53:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 00:54:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:03.409777 543705 memory.go:184] no items to output this cycle
I0321 00:54:03.409817 543705 cpu.go:275] no items to output this cycle
E0321 00:54:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:13.409798 543705 memory.go:191] Add success.
I0321 00:54:13.409824 543705 cpu.go:282] Add success.
W0321 00:54:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:54:13.409838 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:54:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:54:13.420263 543705 net.go:648] Add success.
I0321 00:54:13.423121 543705 net.go:770] primary dev: ETH0
I0321 00:54:13.423135 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:54:13.423150 543705 net.go:698] Add success.
I0321 00:54:13.463050 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ed66baa5-03b1-48b8-82ec-219510f6e62c","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:54:13.463082 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 00:54:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:54:14.455152 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:54:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0321 00:54:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:54:14.456539 543705 disk_worker.go:494] system disk:vda1
I0321 00:54:14.456583 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:54:15.455614 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:54:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:54:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:54:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:54:16.472494 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:54:23.409894 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:23.409911 543705 cpu.go:275] no items to output this cycle
I0321 00:54:23.409912 543705 memory.go:184] no items to output this cycle
I0321 00:54:26.945668 543705 disk_info.go:125] begin check local disk info of client
I0321 00:54:26.948245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:54:26.948252 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0321 00:54:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:33.409802 543705 memory.go:184] no items to output this cycle
I0321 00:54:33.409818 543705 cpu.go:275] no items to output this cycle
I0321 00:54:38.676231 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:54:38.676237 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:54:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:43.410525 543705 memory.go:191] Add success.
I0321 00:54:43.409810 543705 cpu.go:282] Add success.
I0321 00:54:43.420279 543705 net.go:648] Add success.
I0321 00:54:43.422931 543705 net.go:770] primary dev: ETH0
I0321 00:54:43.422946 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:54:43.422960 543705 net.go:698] Add success.
I0321 00:54:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:54:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:54:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:54:53.410264 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:54:53.410282 543705 memory.go:184] no items to output this cycle
I0321 00:54:53.410283 543705 cpu.go:275] no items to output this cycle
E0321 00:55:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:03.409771 543705 memory.go:184] no items to output this cycle
I0321 00:55:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 00:55:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:13.409796 543705 memory.go:191] Add success.
I0321 00:55:13.409810 543705 cpu.go:282] Add success.
W0321 00:55:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:55:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:55:13.409856 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:55:13.420060 543705 net.go:648] Add success.
I0321 00:55:13.422732 543705 net.go:770] primary dev: ETH0
I0321 00:55:13.422745 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:55:13.422758 543705 net.go:698] Add success.
I0321 00:55:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:55:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:55:14.455154 543705 disk_worker.go:708] disk space is not compliant
W0321 00:55:14.455157 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:55:14.456497 543705 disk_worker.go:494] system disk:vda1
I0321 00:55:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:55:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:55:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:55:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:55:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:55:23.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:23.409802 543705 memory.go:184] no items to output this cycle
I0321 00:55:23.409809 543705 cpu.go:275] no items to output this cycle
I0321 00:55:26.949673 543705 disk_info.go:125] begin check local disk info of client
I0321 00:55:26.952225 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:55:26.952231 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004c8c00 0xc0004c8c40]
E0321 00:55:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:33.409770 543705 memory.go:184] no items to output this cycle
I0321 00:55:33.409789 543705 cpu.go:275] no items to output this cycle
E0321 00:55:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:43.409774 543705 memory.go:191] Add success.
I0321 00:55:43.409787 543705 cpu.go:282] Add success.
I0321 00:55:43.420025 543705 net.go:648] Add success.
I0321 00:55:43.420998 543705 net.go:770] primary dev: ETH0
I0321 00:55:43.421012 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:55:43.421023 543705 net.go:698] Add success.
I0321 00:55:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:55:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:55:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:55:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:55:53.409769 543705 memory.go:184] no items to output this cycle
I0321 00:55:53.409791 543705 cpu.go:275] no items to output this cycle
E0321 00:56:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:03.409768 543705 memory.go:184] no items to output this cycle
I0321 00:56:03.409790 543705 cpu.go:275] no items to output this cycle
E0321 00:56:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:13.409818 543705 memory.go:191] Add success.
I0321 00:56:13.409824 543705 cpu.go:282] Add success.
W0321 00:56:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:56:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:56:13.409871 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:56:13.420130 543705 net.go:648] Add success.
I0321 00:56:13.422793 543705 net.go:770] primary dev: ETH0
I0321 00:56:13.422810 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:56:13.422823 543705 net.go:698] Add success.
I0321 00:56:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:56:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:56:14.455146 543705 disk_worker.go:708] disk space is not compliant
W0321 00:56:14.455149 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:56:14.456494 543705 disk_worker.go:494] system disk:vda1
I0321 00:56:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:56:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:56:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:56:16.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:56:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:56:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:56:23.410394 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:23.410414 543705 memory.go:184] no items to output this cycle
I0321 00:56:23.410418 543705 cpu.go:275] no items to output this cycle
I0321 00:56:26.953666 543705 disk_info.go:125] begin check local disk info of client
I0321 00:56:26.956198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:56:26.956205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a48c0 0xc0004a4900]
E0321 00:56:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:33.409766 543705 memory.go:184] no items to output this cycle
I0321 00:56:33.409811 543705 cpu.go:275] no items to output this cycle
E0321 00:56:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:43.409816 543705 memory.go:191] Add success.
I0321 00:56:43.409824 543705 cpu.go:282] Add success.
I0321 00:56:43.420073 543705 net.go:648] Add success.
I0321 00:56:43.422778 543705 net.go:770] primary dev: ETH0
I0321 00:56:43.422789 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:56:43.422801 543705 net.go:698] Add success.
I0321 00:56:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:56:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:56:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:56:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:56:53.409794 543705 memory.go:184] no items to output this cycle
I0321 00:56:53.409807 543705 cpu.go:275] no items to output this cycle
E0321 00:57:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:03.409777 543705 memory.go:184] no items to output this cycle
I0321 00:57:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 00:57:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:13.409821 543705 memory.go:191] Add success.
I0321 00:57:13.409827 543705 cpu.go:282] Add success.
W0321 00:57:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:57:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:57:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:57:13.420211 543705 net.go:648] Add success.
I0321 00:57:13.422923 543705 net.go:770] primary dev: ETH0
I0321 00:57:13.422938 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:57:13.422953 543705 net.go:698] Add success.
I0321 00:57:13.429219 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 00:57:13.453393 543705 event_worker.go:152] Polling the log file for events...
I0321 00:57:13.463713 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e69da210-6d83-49f0-a382-6e9dd2cb686f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 00:57:13.463748 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 00:57:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:57:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0321 00:57:14.455185 543705 disk_worker.go:728] disk inode is not compliant
E0321 00:57:14.456954 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 00:57:14.456963 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 00:57:14.456981 543705 custom_config.go:64] query custom config with name: gpu
I0321 00:57:14.457030 543705 disk_worker.go:494] system disk:vda1
I0321 00:57:14.457083 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 00:57:15.456798 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 00:57:15.456807 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:57:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 00:57:16.457946 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 00:57:16.458011 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:57:16.458030 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:57:16.472363 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:57:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:23.409806 543705 memory.go:184] no items to output this cycle
I0321 00:57:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 00:57:26.957673 543705 disk_info.go:125] begin check local disk info of client
I0321 00:57:26.960177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:57:26.960183 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003fed80 0xc0003fedc0]
E0321 00:57:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:33.409766 543705 memory.go:184] no items to output this cycle
I0321 00:57:33.409787 543705 cpu.go:275] no items to output this cycle
I0321 00:57:38.677089 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 00:57:38.677095 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 00:57:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:43.410538 543705 memory.go:191] Add success.
I0321 00:57:43.409818 543705 cpu.go:282] Add success.
I0321 00:57:43.420299 543705 net.go:648] Add success.
I0321 00:57:43.422967 543705 net.go:770] primary dev: ETH0
I0321 00:57:43.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:57:43.422991 543705 net.go:698] Add success.
I0321 00:57:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:57:46.458063 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:57:46.458089 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:57:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:57:53.409795 543705 memory.go:184] no items to output this cycle
I0321 00:57:53.409808 543705 cpu.go:275] no items to output this cycle
E0321 00:58:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:03.409770 543705 memory.go:184] no items to output this cycle
I0321 00:58:03.409796 543705 cpu.go:275] no items to output this cycle
E0321 00:58:13.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:13.409793 543705 memory.go:191] Add success.
I0321 00:58:13.409796 543705 cpu.go:282] Add success.
W0321 00:58:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:58:13.412446 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:58:13.412450 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:58:13.420162 543705 net.go:648] Add success.
I0321 00:58:13.421784 543705 net.go:770] primary dev: ETH0
I0321 00:58:13.421797 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:58:13.421810 543705 net.go:698] Add success.
I0321 00:58:14.454983 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:58:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:58:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0321 00:58:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:58:14.456593 543705 disk_worker.go:494] system disk:vda1
I0321 00:58:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:58:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:58:16.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:58:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:58:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:58:16.472440 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:58:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:23.409785 543705 memory.go:184] no items to output this cycle
I0321 00:58:23.409805 543705 cpu.go:275] no items to output this cycle
I0321 00:58:26.961675 543705 disk_info.go:125] begin check local disk info of client
I0321 00:58:26.964218 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:58:26.964224 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e4440 0xc0003e4480]
E0321 00:58:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:33.409801 543705 memory.go:184] no items to output this cycle
I0321 00:58:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 00:58:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:43.409789 543705 memory.go:191] Add success.
I0321 00:58:43.409794 543705 cpu.go:282] Add success.
I0321 00:58:43.419706 543705 net.go:648] Add success.
I0321 00:58:43.422440 543705 net.go:770] primary dev: ETH0
I0321 00:58:43.422453 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:58:43.422464 543705 net.go:698] Add success.
I0321 00:58:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:58:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:58:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:58:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:58:53.409802 543705 memory.go:184] no items to output this cycle
I0321 00:58:53.409810 543705 cpu.go:275] no items to output this cycle
E0321 00:59:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:03.409822 543705 memory.go:184] no items to output this cycle
I0321 00:59:03.409827 543705 cpu.go:275] no items to output this cycle
E0321 00:59:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:13.409794 543705 memory.go:191] Add success.
I0321 00:59:13.409798 543705 cpu.go:282] Add success.
W0321 00:59:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 00:59:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 00:59:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 00:59:13.420053 543705 net.go:648] Add success.
I0321 00:59:13.422846 543705 net.go:770] primary dev: ETH0
I0321 00:59:13.422859 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:59:13.422870 543705 net.go:698] Add success.
I0321 00:59:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0321 00:59:14.455178 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 00:59:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0321 00:59:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0321 00:59:14.456569 543705 disk_worker.go:494] system disk:vda1
I0321 00:59:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 00:59:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 00:59:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:59:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:59:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 00:59:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0321 00:59:23.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:23.409807 543705 memory.go:184] no items to output this cycle
I0321 00:59:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 00:59:26.965672 543705 disk_info.go:125] begin check local disk info of client
I0321 00:59:26.968186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 00:59:26.968192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ec500 0xc0000ec540]
E0321 00:59:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:33.409772 543705 memory.go:184] no items to output this cycle
I0321 00:59:33.409781 543705 cpu.go:275] no items to output this cycle
E0321 00:59:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:43.409789 543705 memory.go:191] Add success.
I0321 00:59:43.409790 543705 cpu.go:282] Add success.
I0321 00:59:43.419845 543705 net.go:648] Add success.
I0321 00:59:43.422619 543705 net.go:770] primary dev: ETH0
I0321 00:59:43.422631 543705 net.go:802] Send network stats successfully!,count is 6
I0321 00:59:43.422667 543705 net.go:698] Add success.
I0321 00:59:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 00:59:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 00:59:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 00:59:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 00:59:53.409765 543705 memory.go:184] no items to output this cycle
I0321 00:59:53.409795 543705 cpu.go:275] no items to output this cycle
E0321 01:00:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:03.409808 543705 memory.go:184] no items to output this cycle
I0321 01:00:03.409818 543705 cpu.go:275] no items to output this cycle
E0321 01:00:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:13.409791 543705 memory.go:191] Add success.
W0321 01:00:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:00:13.409822 543705 cpu.go:282] Add success.
W0321 01:00:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:00:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:00:13.420540 543705 net.go:648] Add success.
I0321 01:00:13.423162 543705 net.go:770] primary dev: ETH0
I0321 01:00:13.423186 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:00:13.423199 543705 net.go:698] Add success.
I0321 01:00:13.472407 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"12e76eea-e04b-4b39-8174-3c9ae88a3693","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:00:13.472440 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:00:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:00:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:00:14.455198 543705 disk_worker.go:708] disk space is not compliant
W0321 01:00:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:00:14.456765 543705 disk_worker.go:494] system disk:vda1
I0321 01:00:14.456793 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:00:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:00:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:00:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:00:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:00:16.472495 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:00:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:23.409810 543705 memory.go:184] no items to output this cycle
I0321 01:00:23.409820 543705 cpu.go:275] no items to output this cycle
I0321 01:00:26.969674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:00:26.972272 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:00:26.972278 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e2640 0xc0003e2680]
E0321 01:00:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:33.409767 543705 memory.go:184] no items to output this cycle
I0321 01:00:33.409795 543705 cpu.go:275] no items to output this cycle
I0321 01:00:38.677732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:00:38.677739 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:00:43.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:43.410717 543705 memory.go:191] Add success.
I0321 01:00:43.409826 543705 cpu.go:282] Add success.
I0321 01:00:43.420447 543705 net.go:648] Add success.
I0321 01:00:43.423138 543705 net.go:770] primary dev: ETH0
I0321 01:00:43.423151 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:00:43.423164 543705 net.go:698] Add success.
I0321 01:00:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:00:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:00:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:00:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:00:53.409889 543705 memory.go:184] no items to output this cycle
I0321 01:00:53.409892 543705 cpu.go:275] no items to output this cycle
E0321 01:01:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:03.409768 543705 memory.go:184] no items to output this cycle
I0321 01:01:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 01:01:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:13.409827 543705 memory.go:191] Add success.
I0321 01:01:13.409827 543705 cpu.go:282] Add success.
W0321 01:01:13.409859 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:01:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:01:13.409874 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:01:13.420394 543705 net.go:648] Add success.
I0321 01:01:13.423146 543705 net.go:770] primary dev: ETH0
I0321 01:01:13.423159 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:01:13.423172 543705 net.go:698] Add success.
I0321 01:01:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:01:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:01:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0321 01:01:14.455227 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:01:14.456616 543705 disk_worker.go:494] system disk:vda1
I0321 01:01:14.456649 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:01:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:01:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:01:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:01:16.458053 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:01:16.472372 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:01:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:23.409804 543705 memory.go:184] no items to output this cycle
I0321 01:01:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 01:01:26.973676 543705 disk_info.go:125] begin check local disk info of client
I0321 01:01:26.976280 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:01:26.976288 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fba00 0xc0001fba40]
E0321 01:01:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:33.409799 543705 memory.go:184] no items to output this cycle
I0321 01:01:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 01:01:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:43.409781 543705 memory.go:191] Add success.
I0321 01:01:43.409806 543705 cpu.go:282] Add success.
I0321 01:01:43.419912 543705 net.go:648] Add success.
I0321 01:01:43.422935 543705 net.go:770] primary dev: ETH0
I0321 01:01:43.422948 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:01:43.422960 543705 net.go:698] Add success.
I0321 01:01:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:01:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:01:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:01:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:01:53.409790 543705 memory.go:184] no items to output this cycle
I0321 01:01:53.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:02:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:03.409779 543705 cpu.go:275] no items to output this cycle
I0321 01:02:03.409781 543705 memory.go:184] no items to output this cycle
E0321 01:02:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:13.409825 543705 memory.go:191] Add success.
I0321 01:02:13.409832 543705 cpu.go:282] Add success.
W0321 01:02:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:02:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:02:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:02:13.420255 543705 net.go:648] Add success.
I0321 01:02:13.423225 543705 net.go:770] primary dev: ETH0
I0321 01:02:13.423238 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:02:13.423250 543705 net.go:698] Add success.
W0321 01:02:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:02:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0321 01:02:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:02:14.456488 543705 disk_worker.go:494] system disk:vda1
I0321 01:02:14.456514 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:02:14.456952 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:02:14.456961 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:02:14.456967 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:02:15.456833 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:02:15.456842 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:02:16.457926 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:02:16.457925 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:02:16.457981 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:02:16.458003 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:02:16.472337 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:02:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:23.409781 543705 memory.go:184] no items to output this cycle
I0321 01:02:23.409788 543705 cpu.go:275] no items to output this cycle
I0321 01:02:26.977675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:02:26.980201 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:02:26.980207 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000eca80 0xc0000ecac0]
E0321 01:02:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:33.409789 543705 memory.go:184] no items to output this cycle
I0321 01:02:33.409801 543705 cpu.go:275] no items to output this cycle
E0321 01:02:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:43.409801 543705 memory.go:191] Add success.
I0321 01:02:43.409809 543705 cpu.go:282] Add success.
I0321 01:02:43.419890 543705 net.go:648] Add success.
I0321 01:02:43.422616 543705 net.go:770] primary dev: ETH0
I0321 01:02:43.422630 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:02:43.422643 543705 net.go:698] Add success.
I0321 01:02:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:02:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:02:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:02:53.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:02:53.409794 543705 memory.go:184] no items to output this cycle
I0321 01:02:53.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:03:03.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:03.409782 543705 memory.go:184] no items to output this cycle
I0321 01:03:03.409785 543705 cpu.go:275] no items to output this cycle
I0321 01:03:13.409965 543705 cpu.go:282] Add success.
E0321 01:03:13.410058 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:13.410078 543705 memory.go:191] Add success.
W0321 01:03:13.410108 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:03:13.410120 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:03:13.410123 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:03:13.419710 543705 net.go:648] Add success.
I0321 01:03:13.422548 543705 net.go:770] primary dev: ETH0
I0321 01:03:13.422560 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:03:13.422572 543705 net.go:698] Add success.
I0321 01:03:13.464103 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"470dd185-8a6f-4133-b1f0-716b5b9eb4c6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:03:13.464134 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:03:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:03:14.455083 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:03:14.455145 543705 disk_worker.go:708] disk space is not compliant
W0321 01:03:14.455148 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:03:14.456473 543705 disk_worker.go:494] system disk:vda1
I0321 01:03:14.456517 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:03:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:03:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:03:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:03:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:03:16.472439 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:03:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:23.409786 543705 memory.go:184] no items to output this cycle
I0321 01:03:23.409789 543705 cpu.go:275] no items to output this cycle
I0321 01:03:26.981673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:03:26.984144 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:03:26.984150 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ec580 0xc0000ec5c0]
E0321 01:03:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:33.409797 543705 memory.go:184] no items to output this cycle
I0321 01:03:33.409811 543705 cpu.go:275] no items to output this cycle
I0321 01:03:38.679090 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:03:38.679096 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:03:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:43.410620 543705 memory.go:191] Add success.
I0321 01:03:43.409810 543705 cpu.go:282] Add success.
I0321 01:03:43.420313 543705 net.go:648] Add success.
I0321 01:03:43.423253 543705 net.go:770] primary dev: ETH0
I0321 01:03:43.423267 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:03:43.423279 543705 net.go:698] Add success.
I0321 01:03:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:03:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:03:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:03:53.410358 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:03:53.410376 543705 memory.go:184] no items to output this cycle
I0321 01:03:53.410414 543705 cpu.go:275] no items to output this cycle
E0321 01:04:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:03.409775 543705 memory.go:184] no items to output this cycle
I0321 01:04:03.409790 543705 cpu.go:275] no items to output this cycle
E0321 01:04:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:13.409796 543705 memory.go:191] Add success.
I0321 01:04:13.409823 543705 cpu.go:282] Add success.
W0321 01:04:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:04:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:04:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:04:13.420169 543705 net.go:648] Add success.
I0321 01:04:13.422719 543705 net.go:770] primary dev: ETH0
I0321 01:04:13.422733 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:04:13.422748 543705 net.go:698] Add success.
I0321 01:04:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:04:14.455107 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:04:14.455188 543705 disk_worker.go:708] disk space is not compliant
W0321 01:04:14.455191 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:04:14.456596 543705 disk_worker.go:494] system disk:vda1
I0321 01:04:14.456626 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:04:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:04:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:04:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:04:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:04:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:04:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:23.409779 543705 memory.go:184] no items to output this cycle
I0321 01:04:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 01:04:26.985676 543705 disk_info.go:125] begin check local disk info of client
I0321 01:04:26.988249 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:04:26.988255 543705 disk_info.go:196] parse disk info done, disk is : [0xc000261300 0xc000261340]
E0321 01:04:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:33.409776 543705 memory.go:184] no items to output this cycle
I0321 01:04:33.409777 543705 cpu.go:275] no items to output this cycle
E0321 01:04:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:43.409785 543705 memory.go:191] Add success.
I0321 01:04:43.409809 543705 cpu.go:282] Add success.
I0321 01:04:43.419957 543705 net.go:648] Add success.
I0321 01:04:43.423111 543705 net.go:770] primary dev: ETH0
I0321 01:04:43.423126 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:04:43.423144 543705 net.go:698] Add success.
I0321 01:04:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:04:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:04:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:04:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:04:53.409759 543705 memory.go:184] no items to output this cycle
I0321 01:04:53.409796 543705 cpu.go:275] no items to output this cycle
E0321 01:05:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:03.409779 543705 memory.go:184] no items to output this cycle
I0321 01:05:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 01:05:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:13.409818 543705 memory.go:191] Add success.
I0321 01:05:13.409825 543705 cpu.go:282] Add success.
W0321 01:05:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:05:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:05:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:05:13.420286 543705 net.go:648] Add success.
I0321 01:05:13.423093 543705 net.go:770] primary dev: ETH0
I0321 01:05:13.423107 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:05:13.423118 543705 net.go:698] Add success.
I0321 01:05:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:05:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:05:14.455183 543705 disk_worker.go:708] disk space is not compliant
W0321 01:05:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:05:14.456593 543705 disk_worker.go:494] system disk:vda1
I0321 01:05:14.456621 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:05:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:05:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:05:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:05:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:05:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:05:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:23.409792 543705 memory.go:184] no items to output this cycle
I0321 01:05:23.409800 543705 cpu.go:275] no items to output this cycle
I0321 01:05:26.989673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:05:26.992200 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:05:26.992206 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faa80 0xc0001faac0]
E0321 01:05:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:33.409803 543705 memory.go:184] no items to output this cycle
I0321 01:05:33.409817 543705 cpu.go:275] no items to output this cycle
E0321 01:05:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:43.409790 543705 memory.go:191] Add success.
I0321 01:05:43.409791 543705 cpu.go:282] Add success.
I0321 01:05:43.419835 543705 net.go:648] Add success.
I0321 01:05:43.422254 543705 net.go:770] primary dev: ETH0
I0321 01:05:43.422269 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:05:43.422283 543705 net.go:698] Add success.
I0321 01:05:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:05:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:05:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:05:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:05:53.409776 543705 memory.go:184] no items to output this cycle
I0321 01:05:53.409778 543705 cpu.go:275] no items to output this cycle
E0321 01:06:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:03.409800 543705 memory.go:184] no items to output this cycle
I0321 01:06:03.409813 543705 cpu.go:275] no items to output this cycle
E0321 01:06:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:13.409823 543705 memory.go:191] Add success.
I0321 01:06:13.409828 543705 cpu.go:282] Add success.
W0321 01:06:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:06:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:06:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:06:13.420342 543705 net.go:648] Add success.
I0321 01:06:13.423471 543705 net.go:770] primary dev: ETH0
I0321 01:06:13.423484 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:06:13.423495 543705 net.go:698] Add success.
I0321 01:06:13.477635 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b76ad627-0e18-4fb4-bdf0-4d291502543f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:06:13.477675 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:06:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:06:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:06:14.455176 543705 disk_worker.go:708] disk space is not compliant
W0321 01:06:14.455178 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:06:14.456520 543705 disk_worker.go:494] system disk:vda1
I0321 01:06:14.456564 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:06:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:06:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:06:16.458027 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:06:16.458047 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:06:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:06:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:23.409780 543705 memory.go:184] no items to output this cycle
I0321 01:06:23.409794 543705 cpu.go:275] no items to output this cycle
I0321 01:06:26.993675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:06:26.996223 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:06:26.996229 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357b00 0xc000357b40]
E0321 01:06:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:33.409775 543705 memory.go:184] no items to output this cycle
I0321 01:06:33.409775 543705 cpu.go:275] no items to output this cycle
I0321 01:06:38.679240 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:06:38.679246 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:06:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:43.410717 543705 memory.go:191] Add success.
I0321 01:06:43.409789 543705 cpu.go:282] Add success.
I0321 01:06:43.420500 543705 net.go:648] Add success.
I0321 01:06:43.423044 543705 net.go:770] primary dev: ETH0
I0321 01:06:43.423058 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:06:43.423073 543705 net.go:698] Add success.
I0321 01:06:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:06:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:06:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:06:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:06:53.409793 543705 memory.go:184] no items to output this cycle
I0321 01:06:53.409807 543705 cpu.go:275] no items to output this cycle
E0321 01:07:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:03.409777 543705 memory.go:184] no items to output this cycle
I0321 01:07:03.409800 543705 cpu.go:275] no items to output this cycle
E0321 01:07:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:13.409821 543705 memory.go:191] Add success.
I0321 01:07:13.409822 543705 cpu.go:282] Add success.
W0321 01:07:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:07:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:07:13.409864 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:07:13.420540 543705 net.go:648] Add success.
I0321 01:07:13.423306 543705 net.go:770] primary dev: ETH0
I0321 01:07:13.423318 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:07:13.423330 543705 net.go:698] Add success.
I0321 01:07:13.452774 543705 event_worker.go:152] Polling the log file for events...
W0321 01:07:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:07:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0321 01:07:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:07:14.456787 543705 disk_worker.go:494] system disk:vda1
I0321 01:07:14.456826 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:07:14.457169 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:07:14.457177 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:07:14.457181 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:07:15.456823 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:07:15.456832 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:07:16.457913 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:07:16.457912 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:07:16.457968 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:07:16.457988 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:07:16.472323 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:07:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:23.409797 543705 memory.go:184] no items to output this cycle
I0321 01:07:23.409807 543705 cpu.go:275] no items to output this cycle
I0321 01:07:26.997673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:07:27.000402 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:07:27.000407 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056de00 0xc00056de40]
E0321 01:07:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:33.409784 543705 memory.go:184] no items to output this cycle
I0321 01:07:33.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:07:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:43.409784 543705 memory.go:191] Add success.
I0321 01:07:43.409804 543705 cpu.go:282] Add success.
I0321 01:07:43.419888 543705 net.go:648] Add success.
I0321 01:07:43.422540 543705 net.go:770] primary dev: ETH0
I0321 01:07:43.422555 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:07:43.422570 543705 net.go:698] Add success.
I0321 01:07:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:07:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:07:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:07:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:07:53.409790 543705 memory.go:184] no items to output this cycle
I0321 01:07:53.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:08:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:03.409810 543705 memory.go:184] no items to output this cycle
I0321 01:08:03.409826 543705 cpu.go:275] no items to output this cycle
E0321 01:08:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:13.409831 543705 memory.go:191] Add success.
I0321 01:08:13.409832 543705 cpu.go:282] Add success.
W0321 01:08:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:08:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:08:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:08:13.420408 543705 net.go:648] Add success.
I0321 01:08:13.423648 543705 net.go:770] primary dev: ETH0
I0321 01:08:13.423660 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:08:13.423672 543705 net.go:698] Add success.
I0321 01:08:14.454950 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:08:14.455082 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:08:14.455160 543705 disk_worker.go:708] disk space is not compliant
W0321 01:08:14.455163 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:08:14.456549 543705 disk_worker.go:494] system disk:vda1
I0321 01:08:14.456579 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:08:15.455613 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:08:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:08:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:08:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:08:16.472394 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:08:23.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:23.409818 543705 memory.go:184] no items to output this cycle
I0321 01:08:23.409827 543705 cpu.go:275] no items to output this cycle
I0321 01:08:27.001680 543705 disk_info.go:125] begin check local disk info of client
I0321 01:08:27.004244 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:08:27.004251 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003041c0 0xc000304200]
E0321 01:08:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:33.409780 543705 memory.go:184] no items to output this cycle
I0321 01:08:33.409785 543705 cpu.go:275] no items to output this cycle
E0321 01:08:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:43.409798 543705 memory.go:191] Add success.
I0321 01:08:43.409797 543705 cpu.go:282] Add success.
I0321 01:08:43.419970 543705 net.go:648] Add success.
I0321 01:08:43.422642 543705 net.go:770] primary dev: ETH0
I0321 01:08:43.422655 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:08:43.422667 543705 net.go:698] Add success.
I0321 01:08:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:08:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:08:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:08:53.410391 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:08:53.410410 543705 memory.go:184] no items to output this cycle
I0321 01:08:53.410410 543705 cpu.go:275] no items to output this cycle
E0321 01:09:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:03.409781 543705 memory.go:184] no items to output this cycle
I0321 01:09:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 01:09:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:13.409802 543705 memory.go:191] Add success.
I0321 01:09:13.409819 543705 cpu.go:282] Add success.
W0321 01:09:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:09:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:09:13.409845 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:09:13.420452 543705 net.go:648] Add success.
I0321 01:09:13.423240 543705 net.go:770] primary dev: ETH0
I0321 01:09:13.423253 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:09:13.423267 543705 net.go:698] Add success.
I0321 01:09:13.463301 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e0dfe152-c528-4ba8-a264-fcec8e9845e4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:09:13.463334 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:09:14.454968 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:09:14.455172 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:09:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 01:09:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:09:14.456683 543705 disk_worker.go:494] system disk:vda1
I0321 01:09:14.456710 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:09:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:09:16.457964 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:09:16.458023 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:09:16.458042 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:09:16.472369 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:09:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:23.409808 543705 memory.go:184] no items to output this cycle
I0321 01:09:23.409819 543705 cpu.go:275] no items to output this cycle
I0321 01:09:27.005671 543705 disk_info.go:125] begin check local disk info of client
I0321 01:09:27.008156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:09:27.008162 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056cbc0 0xc00056cc00]
E0321 01:09:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:33.409790 543705 cpu.go:275] no items to output this cycle
I0321 01:09:33.409797 543705 memory.go:184] no items to output this cycle
I0321 01:09:38.680095 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:09:38.680102 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:09:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:43.410688 543705 memory.go:191] Add success.
I0321 01:09:43.409789 543705 cpu.go:282] Add success.
I0321 01:09:43.420384 543705 net.go:648] Add success.
I0321 01:09:43.423433 543705 net.go:770] primary dev: ETH0
I0321 01:09:43.423446 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:09:43.423459 543705 net.go:698] Add success.
I0321 01:09:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:09:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:09:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:09:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:09:53.409771 543705 memory.go:184] no items to output this cycle
I0321 01:09:53.409792 543705 cpu.go:275] no items to output this cycle
E0321 01:10:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:03.409772 543705 memory.go:184] no items to output this cycle
I0321 01:10:03.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:10:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:13.409804 543705 memory.go:191] Add success.
I0321 01:10:13.409805 543705 cpu.go:282] Add success.
W0321 01:10:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:10:13.409843 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:10:13.409846 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:10:13.419725 543705 net.go:648] Add success.
I0321 01:10:13.422482 543705 net.go:770] primary dev: ETH0
I0321 01:10:13.422501 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:10:13.422515 543705 net.go:698] Add success.
I0321 01:10:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:10:14.455135 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:10:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0321 01:10:14.455220 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:10:14.456616 543705 disk_worker.go:494] system disk:vda1
I0321 01:10:14.456647 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:10:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:10:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:10:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:10:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:10:16.472386 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:10:23.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:23.409806 543705 memory.go:184] no items to output this cycle
I0321 01:10:23.409814 543705 cpu.go:275] no items to output this cycle
I0321 01:10:27.009673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:10:27.012245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:10:27.012251 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb900 0xc0001fb940]
E0321 01:10:33.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:33.409757 543705 memory.go:184] no items to output this cycle
I0321 01:10:33.409794 543705 cpu.go:275] no items to output this cycle
E0321 01:10:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:43.409824 543705 memory.go:191] Add success.
I0321 01:10:43.409832 543705 cpu.go:282] Add success.
I0321 01:10:43.420151 543705 net.go:648] Add success.
I0321 01:10:43.422785 543705 net.go:770] primary dev: ETH0
I0321 01:10:43.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:10:43.422814 543705 net.go:698] Add success.
I0321 01:10:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:10:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:10:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:10:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:10:53.409780 543705 memory.go:184] no items to output this cycle
I0321 01:10:53.409801 543705 cpu.go:275] no items to output this cycle
E0321 01:11:03.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:03.409761 543705 memory.go:184] no items to output this cycle
I0321 01:11:03.409794 543705 cpu.go:275] no items to output this cycle
E0321 01:11:13.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:13.409823 543705 memory.go:191] Add success.
I0321 01:11:13.409823 543705 cpu.go:282] Add success.
W0321 01:11:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:11:13.409864 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:11:13.409867 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:11:13.420578 543705 net.go:648] Add success.
I0321 01:11:13.423844 543705 net.go:770] primary dev: ETH0
I0321 01:11:13.423863 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:11:13.423878 543705 net.go:698] Add success.
I0321 01:11:14.454954 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:11:14.455180 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:11:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 01:11:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:11:14.456584 543705 disk_worker.go:494] system disk:vda1
I0321 01:11:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:11:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:11:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:11:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:11:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:11:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:11:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:23.409772 543705 memory.go:184] no items to output this cycle
I0321 01:11:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 01:11:27.013679 543705 disk_info.go:125] begin check local disk info of client
I0321 01:11:27.016193 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:11:27.016200 543705 disk_info.go:196] parse disk info done, disk is : [0xc000567b00 0xc000567b40]
E0321 01:11:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:33.409795 543705 memory.go:184] no items to output this cycle
I0321 01:11:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 01:11:43.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:43.409778 543705 memory.go:191] Add success.
I0321 01:11:43.409799 543705 cpu.go:282] Add success.
I0321 01:11:43.419873 543705 net.go:648] Add success.
I0321 01:11:43.422737 543705 net.go:770] primary dev: ETH0
I0321 01:11:43.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:11:43.422763 543705 net.go:698] Add success.
I0321 01:11:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:11:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:11:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:11:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:11:53.409777 543705 memory.go:184] no items to output this cycle
I0321 01:11:53.409796 543705 cpu.go:275] no items to output this cycle
E0321 01:12:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:03.409780 543705 memory.go:184] no items to output this cycle
I0321 01:12:03.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:12:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:13.409798 543705 memory.go:191] Add success.
I0321 01:12:13.409800 543705 cpu.go:282] Add success.
W0321 01:12:13.409956 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:12:13.409980 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:12:13.409983 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:12:13.419734 543705 net.go:648] Add success.
I0321 01:12:13.422596 543705 net.go:770] primary dev: ETH0
I0321 01:12:13.422610 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:12:13.422622 543705 net.go:698] Add success.
I0321 01:12:13.468805 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"e71f31ae-792a-4c9d-b8d1-0afdbb0bec1d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:12:13.468837 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 01:12:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:12:14.455196 543705 disk_worker.go:708] disk space is not compliant
W0321 01:12:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:12:14.456839 543705 disk_worker.go:494] system disk:vda1
I0321 01:12:14.456877 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:12:14.457165 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:12:14.457173 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:12:14.457178 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:12:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:12:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:12:16.457932 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:12:16.457941 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:12:16.457986 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:12:16.458002 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:12:16.472343 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:12:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:23.409774 543705 memory.go:184] no items to output this cycle
I0321 01:12:23.409805 543705 cpu.go:275] no items to output this cycle
I0321 01:12:27.017674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:12:27.020286 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:12:27.020292 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5380 0xc0000c53c0]
E0321 01:12:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:33.409770 543705 memory.go:184] no items to output this cycle
I0321 01:12:33.409792 543705 cpu.go:275] no items to output this cycle
I0321 01:12:38.680249 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:12:38.680255 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:12:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:43.409775 543705 memory.go:191] Add success.
I0321 01:12:43.409785 543705 cpu.go:282] Add success.
I0321 01:12:43.419848 543705 net.go:648] Add success.
I0321 01:12:43.420805 543705 net.go:770] primary dev: ETH0
I0321 01:12:43.420818 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:12:43.420831 543705 net.go:698] Add success.
I0321 01:12:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:12:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:12:46.458060 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:12:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:12:53.409781 543705 memory.go:184] no items to output this cycle
I0321 01:12:53.409785 543705 cpu.go:275] no items to output this cycle
E0321 01:13:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:03.409793 543705 memory.go:184] no items to output this cycle
I0321 01:13:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:13:13.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:13.409897 543705 memory.go:191] Add success.
W0321 01:13:13.409934 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:13:13.409944 543705 cpu.go:282] Add success.
W0321 01:13:13.409952 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:13:13.409964 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:13:13.419714 543705 net.go:648] Add success.
I0321 01:13:13.422671 543705 net.go:770] primary dev: ETH0
I0321 01:13:13.422685 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:13:13.422696 543705 net.go:698] Add success.
I0321 01:13:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:13:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:13:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0321 01:13:14.455179 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:13:14.456505 543705 disk_worker.go:494] system disk:vda1
I0321 01:13:14.456548 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:13:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:13:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:13:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:13:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:13:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:13:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:23.409799 543705 memory.go:184] no items to output this cycle
I0321 01:13:23.409812 543705 cpu.go:275] no items to output this cycle
I0321 01:13:27.021672 543705 disk_info.go:125] begin check local disk info of client
I0321 01:13:27.024128 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:13:27.024134 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa100 0xc0001fa140]
E0321 01:13:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:33.409776 543705 memory.go:184] no items to output this cycle
I0321 01:13:33.409779 543705 cpu.go:275] no items to output this cycle
E0321 01:13:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:43.409790 543705 memory.go:191] Add success.
I0321 01:13:43.409791 543705 cpu.go:282] Add success.
I0321 01:13:43.419895 543705 net.go:648] Add success.
I0321 01:13:43.422651 543705 net.go:770] primary dev: ETH0
I0321 01:13:43.422665 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:13:43.422677 543705 net.go:698] Add success.
I0321 01:13:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:13:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:13:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:13:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:13:53.409781 543705 memory.go:184] no items to output this cycle
I0321 01:13:53.409788 543705 cpu.go:275] no items to output this cycle
E0321 01:14:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:03.409773 543705 memory.go:184] no items to output this cycle
I0321 01:14:03.409798 543705 cpu.go:275] no items to output this cycle
E0321 01:14:13.409903 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:13.409936 543705 memory.go:191] Add success.
I0321 01:14:13.409945 543705 cpu.go:282] Add success.
W0321 01:14:13.409968 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:14:13.409981 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:14:13.409989 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:14:13.419716 543705 net.go:648] Add success.
I0321 01:14:13.422305 543705 net.go:770] primary dev: ETH0
I0321 01:14:13.422328 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:14:13.422340 543705 net.go:698] Add success.
I0321 01:14:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:14:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:14:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0321 01:14:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:14:14.456619 543705 disk_worker.go:494] system disk:vda1
I0321 01:14:14.456654 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:14:15.454998 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:14:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:14:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:14:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:14:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:14:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:23.409784 543705 memory.go:184] no items to output this cycle
I0321 01:14:23.409800 543705 cpu.go:275] no items to output this cycle
I0321 01:14:27.025675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:14:27.028226 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:14:27.028234 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b080 0xc00007b0c0]
E0321 01:14:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:33.409773 543705 memory.go:184] no items to output this cycle
I0321 01:14:33.409789 543705 cpu.go:275] no items to output this cycle
E0321 01:14:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:43.409812 543705 memory.go:191] Add success.
I0321 01:14:43.409819 543705 cpu.go:282] Add success.
I0321 01:14:43.419875 543705 net.go:648] Add success.
I0321 01:14:43.422729 543705 net.go:770] primary dev: ETH0
I0321 01:14:43.422741 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:14:43.422754 543705 net.go:698] Add success.
I0321 01:14:46.457995 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:14:46.458075 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:14:46.458102 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:14:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:14:53.409773 543705 memory.go:184] no items to output this cycle
I0321 01:14:53.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:15:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:03.409796 543705 memory.go:184] no items to output this cycle
I0321 01:15:03.409809 543705 cpu.go:275] no items to output this cycle
E0321 01:15:13.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:13.409907 543705 memory.go:191] Add success.
I0321 01:15:13.409911 543705 cpu.go:282] Add success.
W0321 01:15:13.409953 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:15:13.409976 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:15:13.409980 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:15:13.419739 543705 net.go:648] Add success.
I0321 01:15:13.422471 543705 net.go:770] primary dev: ETH0
I0321 01:15:13.422483 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:15:13.422496 543705 net.go:698] Add success.
I0321 01:15:13.463665 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"57377629-c537-47a3-acff-133f1d2efb0f","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:15:13.463694 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:15:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:15:14.455100 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:15:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0321 01:15:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:15:14.456588 543705 disk_worker.go:494] system disk:vda1
I0321 01:15:14.456618 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:15:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:15:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:15:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:15:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:15:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:15:23.410239 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:23.410257 543705 memory.go:184] no items to output this cycle
I0321 01:15:23.410273 543705 cpu.go:275] no items to output this cycle
I0321 01:15:27.029673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:15:27.032179 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:15:27.032186 543705 disk_info.go:196] parse disk info done, disk is : [0xc000546e80 0xc000546ec0]
E0321 01:15:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:33.409785 543705 memory.go:184] no items to output this cycle
I0321 01:15:33.409789 543705 cpu.go:275] no items to output this cycle
I0321 01:15:38.680393 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:15:38.680400 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:15:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:43.410751 543705 memory.go:191] Add success.
I0321 01:15:43.409789 543705 cpu.go:282] Add success.
I0321 01:15:43.420431 543705 net.go:648] Add success.
I0321 01:15:43.423697 543705 net.go:770] primary dev: ETH0
I0321 01:15:43.423710 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:15:43.423730 543705 net.go:698] Add success.
I0321 01:15:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:15:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:15:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:15:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:15:53.409782 543705 memory.go:184] no items to output this cycle
I0321 01:15:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:16:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:03.409904 543705 memory.go:184] no items to output this cycle
I0321 01:16:03.409919 543705 cpu.go:275] no items to output this cycle
E0321 01:16:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:13.409787 543705 memory.go:191] Add success.
W0321 01:16:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:16:13.409815 543705 cpu.go:282] Add success.
W0321 01:16:13.409826 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:16:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:16:13.420141 543705 net.go:648] Add success.
I0321 01:16:13.422774 543705 net.go:770] primary dev: ETH0
I0321 01:16:13.422786 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:16:13.422799 543705 net.go:698] Add success.
I0321 01:16:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:16:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:16:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0321 01:16:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:16:14.456613 543705 disk_worker.go:494] system disk:vda1
I0321 01:16:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:16:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:16:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:16:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:16:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:16:16.472440 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:16:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:23.409809 543705 memory.go:184] no items to output this cycle
I0321 01:16:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 01:16:27.033674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:16:27.036208 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:16:27.036214 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000ed280 0xc0000ed2c0]
E0321 01:16:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:33.409767 543705 memory.go:184] no items to output this cycle
I0321 01:16:33.409800 543705 cpu.go:275] no items to output this cycle
E0321 01:16:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:43.409809 543705 memory.go:191] Add success.
I0321 01:16:43.409817 543705 cpu.go:282] Add success.
I0321 01:16:43.420054 543705 net.go:648] Add success.
I0321 01:16:43.422891 543705 net.go:770] primary dev: ETH0
I0321 01:16:43.422903 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:16:43.422914 543705 net.go:698] Add success.
I0321 01:16:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:16:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:16:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:16:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:16:53.409797 543705 memory.go:184] no items to output this cycle
I0321 01:16:53.409808 543705 cpu.go:275] no items to output this cycle
E0321 01:17:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:17:03.409783 543705 cpu.go:275] no items to output this cycle
E0321 01:17:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:13.409804 543705 memory.go:191] Add success.
I0321 01:17:13.409805 543705 cpu.go:282] Add success.
W0321 01:17:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:17:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:17:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:17:13.420218 543705 net.go:648] Add success.
I0321 01:17:13.423037 543705 net.go:770] primary dev: ETH0
I0321 01:17:13.423050 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:17:13.423063 543705 net.go:698] Add success.
I0321 01:17:13.453721 543705 event_worker.go:152] Polling the log file for events...
W0321 01:17:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:17:14.455189 543705 disk_worker.go:708] disk space is not compliant
W0321 01:17:14.455192 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:17:14.455937 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:17:14.455946 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:17:14.455951 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:17:14.456595 543705 disk_worker.go:494] system disk:vda1
I0321 01:17:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:17:15.456862 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:17:15.456870 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:17:16.457935 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:17:16.457935 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:17:16.457990 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:17:16.458009 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:17:16.472342 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:17:23.410559 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:23.410579 543705 memory.go:184] no items to output this cycle
I0321 01:17:23.410589 543705 cpu.go:275] no items to output this cycle
I0321 01:17:27.037672 543705 disk_info.go:125] begin check local disk info of client
I0321 01:17:27.040248 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:17:27.040254 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fabc0 0xc0001fac00]
E0321 01:17:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:33.409792 543705 memory.go:184] no items to output this cycle
I0321 01:17:33.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:17:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:43.409795 543705 memory.go:191] Add success.
I0321 01:17:43.409810 543705 cpu.go:282] Add success.
I0321 01:17:43.420016 543705 net.go:648] Add success.
I0321 01:17:43.422742 543705 net.go:770] primary dev: ETH0
I0321 01:17:43.422757 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:17:43.422770 543705 net.go:698] Add success.
I0321 01:17:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:17:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:17:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:17:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:17:53.409768 543705 memory.go:184] no items to output this cycle
I0321 01:17:53.409797 543705 cpu.go:275] no items to output this cycle
E0321 01:18:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:03.409792 543705 memory.go:184] no items to output this cycle
I0321 01:18:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:18:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:13.409798 543705 cpu.go:282] Add success.
I0321 01:18:13.409799 543705 memory.go:191] Add success.
W0321 01:18:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:18:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:18:13.409843 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:18:13.419718 543705 net.go:648] Add success.
I0321 01:18:13.423216 543705 net.go:770] primary dev: ETH0
I0321 01:18:13.423230 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:18:13.423244 543705 net.go:698] Add success.
I0321 01:18:13.475093 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"2da4d194-d2a4-46b0-b9b5-b2893001600a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:18:13.475123 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:18:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:18:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:18:14.455171 543705 disk_worker.go:708] disk space is not compliant
W0321 01:18:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:18:14.456515 543705 disk_worker.go:494] system disk:vda1
I0321 01:18:14.456561 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:18:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:18:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:18:16.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:18:16.458056 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:18:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:18:23.410247 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:23.410265 543705 memory.go:184] no items to output this cycle
I0321 01:18:23.410287 543705 cpu.go:275] no items to output this cycle
I0321 01:18:27.041683 543705 disk_info.go:125] begin check local disk info of client
I0321 01:18:27.044259 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:18:27.044265 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003565c0 0xc000356600]
E0321 01:18:33.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:33.409787 543705 memory.go:184] no items to output this cycle
I0321 01:18:33.409796 543705 cpu.go:275] no items to output this cycle
I0321 01:18:38.681095 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:18:38.681101 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:18:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:43.410595 543705 memory.go:191] Add success.
I0321 01:18:43.409807 543705 cpu.go:282] Add success.
I0321 01:18:43.420317 543705 net.go:648] Add success.
I0321 01:18:43.422782 543705 net.go:770] primary dev: ETH0
I0321 01:18:43.422795 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:18:43.422810 543705 net.go:698] Add success.
I0321 01:18:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:18:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:18:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:18:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:18:53.409783 543705 memory.go:184] no items to output this cycle
I0321 01:18:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 01:19:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:19:03.409809 543705 cpu.go:275] no items to output this cycle
E0321 01:19:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:13.409825 543705 memory.go:191] Add success.
I0321 01:19:13.409829 543705 cpu.go:282] Add success.
W0321 01:19:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:19:13.410039 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:19:13.410046 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:19:13.419744 543705 net.go:648] Add success.
I0321 01:19:13.422317 543705 net.go:770] primary dev: ETH0
I0321 01:19:13.422331 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:19:13.422342 543705 net.go:698] Add success.
I0321 01:19:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:19:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:19:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0321 01:19:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:19:14.456584 543705 disk_worker.go:494] system disk:vda1
I0321 01:19:14.456613 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:19:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:19:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:19:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:19:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:19:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:19:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:23.409798 543705 memory.go:184] no items to output this cycle
I0321 01:19:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 01:19:27.045672 543705 disk_info.go:125] begin check local disk info of client
I0321 01:19:27.048168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:19:27.048174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056de40 0xc00056de80]
E0321 01:19:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:33.409778 543705 memory.go:184] no items to output this cycle
I0321 01:19:33.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:19:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:43.409789 543705 memory.go:191] Add success.
I0321 01:19:43.409804 543705 cpu.go:282] Add success.
I0321 01:19:43.419861 543705 net.go:648] Add success.
I0321 01:19:43.422483 543705 net.go:770] primary dev: ETH0
I0321 01:19:43.422497 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:19:43.422509 543705 net.go:698] Add success.
I0321 01:19:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:19:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:19:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:19:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:19:53.409804 543705 memory.go:184] no items to output this cycle
I0321 01:19:53.409814 543705 cpu.go:275] no items to output this cycle
E0321 01:20:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:03.409803 543705 memory.go:184] no items to output this cycle
I0321 01:20:03.409811 543705 cpu.go:275] no items to output this cycle
E0321 01:20:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:13.409791 543705 memory.go:191] Add success.
I0321 01:20:13.409806 543705 cpu.go:282] Add success.
W0321 01:20:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:20:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:20:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:20:13.420133 543705 net.go:648] Add success.
I0321 01:20:13.423312 543705 net.go:770] primary dev: ETH0
I0321 01:20:13.423390 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:20:13.423404 543705 net.go:698] Add success.
I0321 01:20:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:20:14.455138 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:20:14.455149 543705 disk_worker.go:708] disk space is not compliant
W0321 01:20:14.455151 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:20:14.456473 543705 disk_worker.go:494] system disk:vda1
I0321 01:20:14.456517 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:20:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:20:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:20:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:20:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:20:16.472355 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:20:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:23.409788 543705 memory.go:184] no items to output this cycle
I0321 01:20:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 01:20:27.049679 543705 disk_info.go:125] begin check local disk info of client
I0321 01:20:27.052250 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:20:27.052257 543705 disk_info.go:196] parse disk info done, disk is : [0xc000475600 0xc000475640]
E0321 01:20:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:33.409799 543705 memory.go:184] no items to output this cycle
I0321 01:20:33.409810 543705 cpu.go:275] no items to output this cycle
E0321 01:20:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:43.409784 543705 memory.go:191] Add success.
I0321 01:20:43.409810 543705 cpu.go:282] Add success.
I0321 01:20:43.419903 543705 net.go:648] Add success.
I0321 01:20:43.422822 543705 net.go:770] primary dev: ETH0
I0321 01:20:43.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:20:43.422851 543705 net.go:698] Add success.
I0321 01:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:20:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:20:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:20:53.410339 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:20:53.410364 543705 memory.go:184] no items to output this cycle
I0321 01:20:53.410379 543705 cpu.go:275] no items to output this cycle
E0321 01:21:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:03.409782 543705 cpu.go:275] no items to output this cycle
I0321 01:21:03.409784 543705 memory.go:184] no items to output this cycle
E0321 01:21:13.409802 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:13.409839 543705 memory.go:191] Add success.
I0321 01:21:13.409841 543705 cpu.go:282] Add success.
W0321 01:21:13.409869 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:21:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:21:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:21:13.420215 543705 net.go:648] Add success.
I0321 01:21:13.422801 543705 net.go:770] primary dev: ETH0
I0321 01:21:13.422814 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:21:13.422826 543705 net.go:698] Add success.
I0321 01:21:13.463575 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1264c456-60c3-4e8c-b67b-9fc7c50ba7d6","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:21:13.463607 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:21:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:21:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:21:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0321 01:21:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:21:14.459223 543705 disk_worker.go:494] system disk:vda1
I0321 01:21:14.459253 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:21:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:21:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:21:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:21:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:21:16.472373 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:21:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:23.409799 543705 memory.go:184] no items to output this cycle
I0321 01:21:23.409809 543705 cpu.go:275] no items to output this cycle
I0321 01:21:27.053674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:21:27.056440 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:21:27.056447 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d3c0 0xc00056d400]
E0321 01:21:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:33.409776 543705 cpu.go:275] no items to output this cycle
I0321 01:21:33.409781 543705 memory.go:184] no items to output this cycle
I0321 01:21:38.681746 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:21:38.681753 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:21:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:43.410666 543705 memory.go:191] Add success.
I0321 01:21:43.409809 543705 cpu.go:282] Add success.
I0321 01:21:43.420385 543705 net.go:648] Add success.
I0321 01:21:43.423189 543705 net.go:770] primary dev: ETH0
I0321 01:21:43.423203 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:21:43.423216 543705 net.go:698] Add success.
I0321 01:21:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:21:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:21:46.458088 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:21:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:21:53.409778 543705 memory.go:184] no items to output this cycle
I0321 01:21:53.409780 543705 cpu.go:275] no items to output this cycle
E0321 01:22:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:22:03.409780 543705 cpu.go:275] no items to output this cycle
E0321 01:22:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:13.409823 543705 memory.go:191] Add success.
I0321 01:22:13.409827 543705 cpu.go:282] Add success.
W0321 01:22:13.409855 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:22:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:22:13.409876 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:22:13.420201 543705 net.go:648] Add success.
I0321 01:22:13.422917 543705 net.go:770] primary dev: ETH0
I0321 01:22:13.422933 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:22:13.422947 543705 net.go:698] Add success.
W0321 01:22:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:22:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0321 01:22:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:22:14.456791 543705 disk_worker.go:494] system disk:vda1
I0321 01:22:14.456834 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:22:14.457014 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:22:14.457023 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:22:14.457029 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:22:15.456809 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:22:15.456818 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:22:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:22:16.457979 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:22:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:22:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:22:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:22:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:23.409784 543705 memory.go:184] no items to output this cycle
I0321 01:22:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 01:22:27.057680 543705 disk_info.go:125] begin check local disk info of client
I0321 01:22:27.060238 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:22:27.060245 543705 disk_info.go:196] parse disk info done, disk is : [0xc000466580 0xc0004665c0]
E0321 01:22:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:33.409776 543705 memory.go:184] no items to output this cycle
I0321 01:22:33.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:22:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:43.409796 543705 memory.go:191] Add success.
I0321 01:22:43.409799 543705 cpu.go:282] Add success.
I0321 01:22:43.419999 543705 net.go:648] Add success.
I0321 01:22:43.422723 543705 net.go:770] primary dev: ETH0
I0321 01:22:43.422736 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:22:43.422749 543705 net.go:698] Add success.
I0321 01:22:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:22:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:22:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:22:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:22:53.409791 543705 memory.go:184] no items to output this cycle
I0321 01:22:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 01:23:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:03.409777 543705 cpu.go:275] no items to output this cycle
I0321 01:23:03.409782 543705 memory.go:184] no items to output this cycle
E0321 01:23:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:13.409822 543705 memory.go:191] Add success.
I0321 01:23:13.409830 543705 cpu.go:282] Add success.
W0321 01:23:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:23:13.409871 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:23:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:23:13.420182 543705 net.go:648] Add success.
I0321 01:23:13.422936 543705 net.go:770] primary dev: ETH0
I0321 01:23:13.422949 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:23:13.422962 543705 net.go:698] Add success.
I0321 01:23:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:23:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:23:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0321 01:23:14.455197 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:23:14.456599 543705 disk_worker.go:494] system disk:vda1
I0321 01:23:14.456629 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:23:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:23:16.457967 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:23:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:23:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:23:16.472409 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:23:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:23.409771 543705 memory.go:184] no items to output this cycle
I0321 01:23:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 01:23:27.061674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:23:27.064167 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:23:27.064173 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004a1680 0xc0004a16c0]
E0321 01:23:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:33.409777 543705 cpu.go:275] no items to output this cycle
I0321 01:23:33.409785 543705 memory.go:184] no items to output this cycle
E0321 01:23:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:43.409782 543705 memory.go:191] Add success.
I0321 01:23:43.409804 543705 cpu.go:282] Add success.
I0321 01:23:43.419887 543705 net.go:648] Add success.
I0321 01:23:43.422478 543705 net.go:770] primary dev: ETH0
I0321 01:23:43.422492 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:23:43.422504 543705 net.go:698] Add success.
I0321 01:23:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:23:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:23:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:23:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:23:53.409802 543705 memory.go:184] no items to output this cycle
I0321 01:23:53.409812 543705 cpu.go:275] no items to output this cycle
E0321 01:24:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:03.409777 543705 memory.go:184] no items to output this cycle
I0321 01:24:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 01:24:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:13.409826 543705 memory.go:191] Add success.
I0321 01:24:13.409830 543705 cpu.go:282] Add success.
W0321 01:24:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:24:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:24:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:24:13.420165 543705 net.go:648] Add success.
I0321 01:24:13.422859 543705 net.go:770] primary dev: ETH0
I0321 01:24:13.422873 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:24:13.422888 543705 net.go:698] Add success.
I0321 01:24:13.463872 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"609da7ed-b181-4781-af1a-cfa8e2581639","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:24:13.463910 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:24:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:24:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:24:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0321 01:24:14.455206 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:24:14.456734 543705 disk_worker.go:494] system disk:vda1
I0321 01:24:14.456767 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:24:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:24:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:24:16.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:24:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:24:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:24:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:23.409784 543705 memory.go:184] no items to output this cycle
I0321 01:24:23.409802 543705 cpu.go:275] no items to output this cycle
I0321 01:24:27.065677 543705 disk_info.go:125] begin check local disk info of client
I0321 01:24:27.068256 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:24:27.068262 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001e3900 0xc0001e3940]
E0321 01:24:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:33.409811 543705 memory.go:184] no items to output this cycle
I0321 01:24:33.409819 543705 cpu.go:275] no items to output this cycle
I0321 01:24:38.683112 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:24:38.683119 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:24:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:43.410554 543705 memory.go:191] Add success.
I0321 01:24:43.409791 543705 cpu.go:282] Add success.
I0321 01:24:43.420356 543705 net.go:648] Add success.
I0321 01:24:43.423062 543705 net.go:770] primary dev: ETH0
I0321 01:24:43.423078 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:24:43.423093 543705 net.go:698] Add success.
I0321 01:24:46.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:24:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:24:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:24:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:24:53.409772 543705 memory.go:184] no items to output this cycle
I0321 01:24:53.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:25:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:03.409766 543705 memory.go:184] no items to output this cycle
I0321 01:25:03.409787 543705 cpu.go:275] no items to output this cycle
E0321 01:25:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:13.409825 543705 memory.go:191] Add success.
I0321 01:25:13.409831 543705 cpu.go:282] Add success.
W0321 01:25:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:25:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:25:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:25:13.420507 543705 net.go:648] Add success.
I0321 01:25:13.423234 543705 net.go:770] primary dev: ETH0
I0321 01:25:13.423250 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:25:13.423264 543705 net.go:698] Add success.
I0321 01:25:14.454956 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:25:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:25:14.455205 543705 disk_worker.go:708] disk space is not compliant
W0321 01:25:14.455208 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:25:14.456611 543705 disk_worker.go:494] system disk:vda1
I0321 01:25:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:25:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:25:16.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:25:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:25:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:25:16.472396 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:25:23.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:23.409804 543705 memory.go:184] no items to output this cycle
I0321 01:25:23.409814 543705 cpu.go:275] no items to output this cycle
I0321 01:25:27.069674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:25:27.072177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:25:27.072183 543705 disk_info.go:196] parse disk info done, disk is : [0xc000492b00 0xc000492b40]
E0321 01:25:33.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:33.409761 543705 memory.go:184] no items to output this cycle
I0321 01:25:33.409793 543705 cpu.go:275] no items to output this cycle
E0321 01:25:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:43.409813 543705 memory.go:191] Add success.
I0321 01:25:43.409825 543705 cpu.go:282] Add success.
I0321 01:25:43.420112 543705 net.go:648] Add success.
I0321 01:25:43.422762 543705 net.go:770] primary dev: ETH0
I0321 01:25:43.422777 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:25:43.422791 543705 net.go:698] Add success.
I0321 01:25:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:25:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:25:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:25:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:25:53.409779 543705 cpu.go:275] no items to output this cycle
I0321 01:25:53.409787 543705 memory.go:184] no items to output this cycle
E0321 01:26:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:26:03.409780 543705 cpu.go:275] no items to output this cycle
E0321 01:26:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:13.409806 543705 memory.go:191] Add success.
I0321 01:26:13.409808 543705 cpu.go:282] Add success.
W0321 01:26:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:26:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:26:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:26:13.420233 543705 net.go:648] Add success.
I0321 01:26:13.423082 543705 net.go:770] primary dev: ETH0
I0321 01:26:13.423097 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:26:13.423110 543705 net.go:698] Add success.
I0321 01:26:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:26:14.455189 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:26:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0321 01:26:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:26:14.456602 543705 disk_worker.go:494] system disk:vda1
I0321 01:26:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:26:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:26:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:26:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:26:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:26:16.472393 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:26:23.410229 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:23.410251 543705 memory.go:184] no items to output this cycle
I0321 01:26:23.410257 543705 cpu.go:275] no items to output this cycle
I0321 01:26:27.073666 543705 disk_info.go:125] begin check local disk info of client
I0321 01:26:27.076229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:26:27.076235 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046a340 0xc00046a380]
E0321 01:26:33.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:33.409908 543705 memory.go:184] no items to output this cycle
I0321 01:26:33.410057 543705 cpu.go:275] no items to output this cycle
E0321 01:26:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:43.409781 543705 memory.go:191] Add success.
I0321 01:26:43.409808 543705 cpu.go:282] Add success.
I0321 01:26:43.419991 543705 net.go:648] Add success.
I0321 01:26:43.422960 543705 net.go:770] primary dev: ETH0
I0321 01:26:43.422974 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:26:43.422986 543705 net.go:698] Add success.
I0321 01:26:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:26:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:26:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:26:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:26:53.409801 543705 memory.go:184] no items to output this cycle
I0321 01:26:53.409810 543705 cpu.go:275] no items to output this cycle
E0321 01:27:03.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:03.409799 543705 memory.go:184] no items to output this cycle
I0321 01:27:03.409811 543705 cpu.go:275] no items to output this cycle
W0321 01:27:13.409701 543705 conf_downlod.go:84] cant't download conf: Get "": unsupported protocol scheme ""
W0321 01:27:13.409712 543705 conf_downlod.go:89] use old conf
E0321 01:27:13.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:13.409828 543705 memory.go:191] Add success.
I0321 01:27:13.409844 543705 cpu.go:282] Add success.
W0321 01:27:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:27:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:27:13.409875 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:27:13.420187 543705 net.go:648] Add success.
I0321 01:27:13.423282 543705 net.go:770] primary dev: ETH0
I0321 01:27:13.423295 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:27:13.423308 543705 net.go:698] Add success.
I0321 01:27:13.429405 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 01:27:13.453580 543705 event_worker.go:152] Polling the log file for events...
I0321 01:27:13.469762 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a3e742cc-0bfc-451f-a323-f068bc8c99bc","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:27:13.469797 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 01:27:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:27:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 01:27:14.455194 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:27:14.455887 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:27:14.455897 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:27:14.455902 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:27:14.456550 543705 disk_worker.go:494] system disk:vda1
I0321 01:27:14.456580 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:27:15.456835 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:27:15.456844 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:27:16.457916 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:27:16.457915 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:27:16.457970 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:27:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:27:16.472320 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:27:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:23.409807 543705 memory.go:184] no items to output this cycle
I0321 01:27:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 01:27:27.077675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:27:27.080280 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:27:27.080287 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa000 0xc0001aa040]
E0321 01:27:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:33.409792 543705 memory.go:184] no items to output this cycle
I0321 01:27:33.409806 543705 cpu.go:275] no items to output this cycle
I0321 01:27:38.684126 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:27:38.684134 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:27:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:43.410695 543705 memory.go:191] Add success.
I0321 01:27:43.409787 543705 cpu.go:282] Add success.
I0321 01:27:43.420394 543705 net.go:648] Add success.
I0321 01:27:43.423004 543705 net.go:770] primary dev: ETH0
I0321 01:27:43.423017 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:27:43.423029 543705 net.go:698] Add success.
I0321 01:27:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:27:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:27:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:27:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:27:53.409772 543705 memory.go:184] no items to output this cycle
I0321 01:27:53.409779 543705 cpu.go:275] no items to output this cycle
E0321 01:28:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:03.409795 543705 memory.go:184] no items to output this cycle
I0321 01:28:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:28:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:13.409804 543705 memory.go:191] Add success.
I0321 01:28:13.409806 543705 cpu.go:282] Add success.
W0321 01:28:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:28:13.409844 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:28:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:28:13.420083 543705 net.go:648] Add success.
I0321 01:28:13.422576 543705 net.go:770] primary dev: ETH0
I0321 01:28:13.422591 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:28:13.422606 543705 net.go:698] Add success.
I0321 01:28:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:28:14.455127 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:28:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0321 01:28:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:28:14.456582 543705 disk_worker.go:494] system disk:vda1
I0321 01:28:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:28:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:28:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:28:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:28:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:28:16.472391 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:28:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 01:28:23.409796 543705 memory.go:184] no items to output this cycle
I0321 01:28:27.081680 543705 disk_info.go:125] begin check local disk info of client
I0321 01:28:27.084230 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:28:27.084236 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004bdc80 0xc0004bdcc0]
E0321 01:28:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:33.409781 543705 memory.go:184] no items to output this cycle
I0321 01:28:33.409807 543705 cpu.go:275] no items to output this cycle
E0321 01:28:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:43.409812 543705 memory.go:191] Add success.
I0321 01:28:43.409823 543705 cpu.go:282] Add success.
I0321 01:28:43.419996 543705 net.go:648] Add success.
I0321 01:28:43.422873 543705 net.go:770] primary dev: ETH0
I0321 01:28:43.422885 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:28:43.422898 543705 net.go:698] Add success.
I0321 01:28:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:28:46.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:28:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:28:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:28:53.409785 543705 memory.go:184] no items to output this cycle
I0321 01:28:53.409785 543705 cpu.go:275] no items to output this cycle
E0321 01:29:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:03.409777 543705 memory.go:184] no items to output this cycle
I0321 01:29:03.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:29:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:13.409793 543705 memory.go:191] Add success.
I0321 01:29:13.409811 543705 cpu.go:282] Add success.
W0321 01:29:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:29:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:29:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:29:13.420150 543705 net.go:648] Add success.
I0321 01:29:13.422966 543705 net.go:770] primary dev: ETH0
I0321 01:29:13.422979 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:29:13.422991 543705 net.go:698] Add success.
I0321 01:29:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:29:14.455195 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:29:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0321 01:29:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:29:14.456601 543705 disk_worker.go:494] system disk:vda1
I0321 01:29:14.456633 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:29:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:29:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:29:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:29:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:29:16.472414 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:29:23.410436 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:23.410458 543705 memory.go:184] no items to output this cycle
I0321 01:29:23.410466 543705 cpu.go:275] no items to output this cycle
I0321 01:29:27.085664 543705 disk_info.go:125] begin check local disk info of client
I0321 01:29:27.088132 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:29:27.088138 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e9640 0xc0003e9680]
E0321 01:29:33.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:33.409891 543705 memory.go:184] no items to output this cycle
I0321 01:29:33.409968 543705 cpu.go:275] no items to output this cycle
E0321 01:29:43.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:43.409813 543705 memory.go:191] Add success.
I0321 01:29:43.409821 543705 cpu.go:282] Add success.
I0321 01:29:43.419876 543705 net.go:648] Add success.
I0321 01:29:43.422545 543705 net.go:770] primary dev: ETH0
I0321 01:29:43.422558 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:29:43.422570 543705 net.go:698] Add success.
I0321 01:29:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:29:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:29:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:29:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:29:53.409776 543705 memory.go:184] no items to output this cycle
I0321 01:29:53.409775 543705 cpu.go:275] no items to output this cycle
E0321 01:30:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:30:03.409783 543705 cpu.go:275] no items to output this cycle
E0321 01:30:13.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:13.409831 543705 memory.go:191] Add success.
I0321 01:30:13.409833 543705 cpu.go:282] Add success.
W0321 01:30:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:30:13.409881 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:30:13.409884 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:30:13.420210 543705 net.go:648] Add success.
I0321 01:30:13.423081 543705 net.go:770] primary dev: ETH0
I0321 01:30:13.423096 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:30:13.423110 543705 net.go:698] Add success.
I0321 01:30:13.470050 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8bacd184-cccb-4dc5-beb2-630dfd7b9b04","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:30:13.470083 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:30:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:30:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:30:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0321 01:30:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:30:14.456628 543705 disk_worker.go:494] system disk:vda1
I0321 01:30:14.456656 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:30:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:30:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:30:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:30:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:30:16.472407 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:30:23.410273 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:23.410294 543705 memory.go:184] no items to output this cycle
I0321 01:30:23.410314 543705 cpu.go:275] no items to output this cycle
I0321 01:30:27.089675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:30:27.092195 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:30:27.092202 543705 disk_info.go:196] parse disk info done, disk is : [0xc00035e800 0xc00035e840]
E0321 01:30:33.409896 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:33.409916 543705 memory.go:184] no items to output this cycle
I0321 01:30:33.409936 543705 cpu.go:275] no items to output this cycle
I0321 01:30:38.685123 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:30:38.685129 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:30:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:43.410680 543705 memory.go:191] Add success.
I0321 01:30:43.409793 543705 cpu.go:282] Add success.
I0321 01:30:43.419829 543705 net.go:648] Add success.
I0321 01:30:43.422346 543705 net.go:770] primary dev: ETH0
I0321 01:30:43.422360 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:30:43.422372 543705 net.go:698] Add success.
I0321 01:30:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:30:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:30:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:30:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:30:53.409794 543705 memory.go:184] no items to output this cycle
I0321 01:30:53.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:31:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:03.409783 543705 memory.go:184] no items to output this cycle
I0321 01:31:03.409787 543705 cpu.go:275] no items to output this cycle
E0321 01:31:13.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:13.409807 543705 memory.go:191] Add success.
I0321 01:31:13.409809 543705 cpu.go:282] Add success.
W0321 01:31:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:31:13.409847 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:31:13.409850 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:31:13.420282 543705 net.go:648] Add success.
I0321 01:31:13.423162 543705 net.go:770] primary dev: ETH0
I0321 01:31:13.423175 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:31:13.423188 543705 net.go:698] Add success.
I0321 01:31:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:31:14.455154 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:31:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0321 01:31:14.455168 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:31:14.456550 543705 disk_worker.go:494] system disk:vda1
I0321 01:31:14.456592 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:31:15.455955 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:31:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:31:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:31:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:31:16.472431 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:31:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:23.409780 543705 memory.go:184] no items to output this cycle
I0321 01:31:23.409783 543705 cpu.go:275] no items to output this cycle
I0321 01:31:27.093677 543705 disk_info.go:125] begin check local disk info of client
I0321 01:31:27.096154 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:31:27.096160 543705 disk_info.go:196] parse disk info done, disk is : [0xc000357d40 0xc000357d80]
E0321 01:31:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:33.409781 543705 memory.go:184] no items to output this cycle
I0321 01:31:33.409783 543705 cpu.go:275] no items to output this cycle
E0321 01:31:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:43.409911 543705 memory.go:191] Add success.
I0321 01:31:43.409942 543705 cpu.go:282] Add success.
I0321 01:31:43.419708 543705 net.go:648] Add success.
I0321 01:31:43.422359 543705 net.go:770] primary dev: ETH0
I0321 01:31:43.422372 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:31:43.422384 543705 net.go:698] Add success.
I0321 01:31:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:31:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:31:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:31:53.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:31:53.409764 543705 memory.go:184] no items to output this cycle
I0321 01:31:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 01:32:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:03.409767 543705 memory.go:184] no items to output this cycle
I0321 01:32:03.409788 543705 cpu.go:275] no items to output this cycle
E0321 01:32:13.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:13.409825 543705 memory.go:191] Add success.
I0321 01:32:13.409826 543705 cpu.go:282] Add success.
W0321 01:32:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:32:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:32:13.409868 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:32:13.420321 543705 net.go:648] Add success.
I0321 01:32:13.423130 543705 net.go:770] primary dev: ETH0
I0321 01:32:13.423144 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:32:13.423157 543705 net.go:698] Add success.
W0321 01:32:14.455123 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:32:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0321 01:32:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:32:14.456791 543705 disk_worker.go:494] system disk:vda1
I0321 01:32:14.456833 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:32:14.457144 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:32:14.457152 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:32:14.457157 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:32:15.456839 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:32:15.456848 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:32:16.457915 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:32:16.457915 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:32:16.457969 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:32:16.457989 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:32:16.472319 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:32:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:23.409808 543705 memory.go:184] no items to output this cycle
I0321 01:32:23.409822 543705 cpu.go:275] no items to output this cycle
I0321 01:32:27.097684 543705 disk_info.go:125] begin check local disk info of client
I0321 01:32:27.100304 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:32:27.100310 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b0640 0xc0003b0680]
E0321 01:32:33.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:33.409783 543705 memory.go:184] no items to output this cycle
I0321 01:32:33.409784 543705 cpu.go:275] no items to output this cycle
E0321 01:32:43.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:43.409777 543705 memory.go:191] Add success.
I0321 01:32:43.409806 543705 cpu.go:282] Add success.
I0321 01:32:43.420175 543705 net.go:648] Add success.
I0321 01:32:43.422796 543705 net.go:770] primary dev: ETH0
I0321 01:32:43.422809 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:32:43.422821 543705 net.go:698] Add success.
I0321 01:32:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:32:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:32:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:32:53.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:32:53.409772 543705 memory.go:184] no items to output this cycle
I0321 01:32:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 01:33:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:03.409779 543705 cpu.go:275] no items to output this cycle
I0321 01:33:03.409780 543705 memory.go:184] no items to output this cycle
E0321 01:33:13.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:13.409826 543705 memory.go:191] Add success.
I0321 01:33:13.409831 543705 cpu.go:282] Add success.
W0321 01:33:13.409856 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:33:13.409875 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:33:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:33:13.420215 543705 net.go:648] Add success.
I0321 01:33:13.423274 543705 net.go:770] primary dev: ETH0
I0321 01:33:13.423289 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:33:13.423303 543705 net.go:698] Add success.
I0321 01:33:13.463510 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0e110d34-8736-4482-aece-ed37b2245470","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:33:13.463547 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:33:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:33:14.455181 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:33:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0321 01:33:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:33:14.456723 543705 disk_worker.go:494] system disk:vda1
I0321 01:33:14.456753 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:33:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:33:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:33:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:33:16.458057 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:33:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:33:23.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:23.409791 543705 cpu.go:275] no items to output this cycle
I0321 01:33:23.409795 543705 memory.go:184] no items to output this cycle
I0321 01:33:27.101675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:33:27.104203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:33:27.104213 543705 disk_info.go:196] parse disk info done, disk is : [0xc00024a740 0xc00024a780]
E0321 01:33:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:33.409801 543705 memory.go:184] no items to output this cycle
I0321 01:33:33.409814 543705 cpu.go:275] no items to output this cycle
I0321 01:33:38.685731 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:33:38.685738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:33:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:43.410686 543705 memory.go:191] Add success.
I0321 01:33:43.409792 543705 cpu.go:282] Add success.
I0321 01:33:43.420663 543705 net.go:648] Add success.
I0321 01:33:43.423453 543705 net.go:770] primary dev: ETH0
I0321 01:33:43.423466 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:33:43.423478 543705 net.go:698] Add success.
I0321 01:33:46.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:33:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:33:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:33:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:33:53.409785 543705 cpu.go:275] no items to output this cycle
I0321 01:33:53.409798 543705 memory.go:184] no items to output this cycle
E0321 01:34:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:03.409782 543705 memory.go:184] no items to output this cycle
I0321 01:34:03.409784 543705 cpu.go:275] no items to output this cycle
E0321 01:34:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:13.409792 543705 memory.go:191] Add success.
I0321 01:34:13.409818 543705 cpu.go:282] Add success.
W0321 01:34:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:34:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:34:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:34:13.420114 543705 net.go:648] Add success.
I0321 01:34:13.422672 543705 net.go:770] primary dev: ETH0
I0321 01:34:13.422685 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:34:13.422695 543705 net.go:698] Add success.
I0321 01:34:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:34:14.455118 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:34:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0321 01:34:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:34:14.456605 543705 disk_worker.go:494] system disk:vda1
I0321 01:34:14.456634 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:34:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:34:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:34:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:34:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:34:16.472388 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:34:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:23.409782 543705 memory.go:184] no items to output this cycle
I0321 01:34:23.409797 543705 cpu.go:275] no items to output this cycle
I0321 01:34:27.105673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:34:27.108212 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:34:27.108218 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003aec00 0xc0003aec40]
E0321 01:34:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:33.409786 543705 cpu.go:275] no items to output this cycle
I0321 01:34:33.409788 543705 memory.go:184] no items to output this cycle
E0321 01:34:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:43.409809 543705 memory.go:191] Add success.
I0321 01:34:43.409817 543705 cpu.go:282] Add success.
I0321 01:34:43.419951 543705 net.go:648] Add success.
I0321 01:34:43.423003 543705 net.go:770] primary dev: ETH0
I0321 01:34:43.423018 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:34:43.423036 543705 net.go:698] Add success.
I0321 01:34:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:34:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:34:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:34:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:34:53.409780 543705 memory.go:184] no items to output this cycle
I0321 01:34:53.409782 543705 cpu.go:275] no items to output this cycle
E0321 01:35:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:03.409792 543705 memory.go:184] no items to output this cycle
I0321 01:35:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:35:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:13.409793 543705 memory.go:191] Add success.
W0321 01:35:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:35:13.409819 543705 cpu.go:282] Add success.
W0321 01:35:13.409831 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:35:13.409834 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:35:13.420156 543705 net.go:648] Add success.
I0321 01:35:13.423135 543705 net.go:770] primary dev: ETH0
I0321 01:35:13.423149 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:35:13.423161 543705 net.go:698] Add success.
I0321 01:35:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:35:14.455131 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:35:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0321 01:35:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:35:14.456595 543705 disk_worker.go:494] system disk:vda1
I0321 01:35:14.456625 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:35:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:35:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:35:16.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:35:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:35:16.472353 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:35:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:23.409789 543705 memory.go:184] no items to output this cycle
I0321 01:35:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 01:35:27.109682 543705 disk_info.go:125] begin check local disk info of client
I0321 01:35:27.112141 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:35:27.112148 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad580 0xc0003ad5c0]
E0321 01:35:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:33.409802 543705 memory.go:184] no items to output this cycle
I0321 01:35:33.409816 543705 cpu.go:275] no items to output this cycle
E0321 01:35:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:43.409783 543705 memory.go:191] Add success.
I0321 01:35:43.409810 543705 cpu.go:282] Add success.
I0321 01:35:43.419881 543705 net.go:648] Add success.
I0321 01:35:43.422480 543705 net.go:770] primary dev: ETH0
I0321 01:35:43.422496 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:35:43.422511 543705 net.go:698] Add success.
I0321 01:35:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:35:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:35:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:35:53.409919 543705 cpu.go:275] no items to output this cycle
E0321 01:35:53.409938 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:35:53.409953 543705 memory.go:184] no items to output this cycle
E0321 01:36:03.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:03.409803 543705 memory.go:184] no items to output this cycle
I0321 01:36:03.409817 543705 cpu.go:275] no items to output this cycle
E0321 01:36:13.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:13.409840 543705 memory.go:191] Add success.
I0321 01:36:13.409854 543705 cpu.go:282] Add success.
W0321 01:36:13.409872 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:36:13.409888 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:36:13.409892 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:36:13.420187 543705 net.go:648] Add success.
I0321 01:36:13.422722 543705 net.go:770] primary dev: ETH0
I0321 01:36:13.422738 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:36:13.422752 543705 net.go:698] Add success.
I0321 01:36:13.469808 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"413d75d0-050a-4273-aad4-147515791c2b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:36:13.469842 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:36:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:36:14.455171 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:36:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 01:36:14.455184 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:36:14.456645 543705 disk_worker.go:494] system disk:vda1
I0321 01:36:14.456677 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:36:15.455607 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:36:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:36:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:36:16.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:36:16.472370 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:36:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:23.409789 543705 memory.go:184] no items to output this cycle
I0321 01:36:23.409812 543705 cpu.go:275] no items to output this cycle
I0321 01:36:27.113677 543705 disk_info.go:125] begin check local disk info of client
I0321 01:36:27.116184 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:36:27.116190 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003af880 0xc0003af8c0]
E0321 01:36:33.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:33.409806 543705 memory.go:184] no items to output this cycle
I0321 01:36:33.409820 543705 cpu.go:275] no items to output this cycle
I0321 01:36:38.687139 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:36:38.687146 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:36:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:43.410685 543705 memory.go:191] Add success.
I0321 01:36:43.409813 543705 cpu.go:282] Add success.
I0321 01:36:43.420440 543705 net.go:648] Add success.
I0321 01:36:43.422990 543705 net.go:770] primary dev: ETH0
I0321 01:36:43.423004 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:36:43.423017 543705 net.go:698] Add success.
I0321 01:36:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:36:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:36:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:36:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:36:53.409786 543705 memory.go:184] no items to output this cycle
I0321 01:36:53.409790 543705 cpu.go:275] no items to output this cycle
E0321 01:37:03.409892 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:03.409908 543705 memory.go:184] no items to output this cycle
I0321 01:37:03.409933 543705 cpu.go:275] no items to output this cycle
E0321 01:37:13.409806 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:13.409842 543705 memory.go:191] Add success.
I0321 01:37:13.409853 543705 cpu.go:282] Add success.
W0321 01:37:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:37:13.409889 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:37:13.409892 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:37:13.420370 543705 net.go:648] Add success.
I0321 01:37:13.423024 543705 net.go:770] primary dev: ETH0
I0321 01:37:13.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:37:13.423050 543705 net.go:698] Add success.
I0321 01:37:13.453572 543705 event_worker.go:152] Polling the log file for events...
W0321 01:37:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:37:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0321 01:37:14.455170 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:37:14.456982 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:37:14.456992 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:37:14.456999 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:37:14.457031 543705 disk_worker.go:494] system disk:vda1
I0321 01:37:14.457060 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:37:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:37:15.456822 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:37:16.457952 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:37:16.457951 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:37:16.458006 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:37:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:37:16.472354 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:37:23.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:23.409816 543705 memory.go:184] no items to output this cycle
I0321 01:37:23.409824 543705 cpu.go:275] no items to output this cycle
I0321 01:37:27.117674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:37:27.120392 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:37:27.120398 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ac440 0xc0003ac480]
E0321 01:37:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:33.409781 543705 memory.go:184] no items to output this cycle
I0321 01:37:33.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:37:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:43.409817 543705 memory.go:191] Add success.
I0321 01:37:43.409827 543705 cpu.go:282] Add success.
I0321 01:37:43.419898 543705 net.go:648] Add success.
I0321 01:37:43.422586 543705 net.go:770] primary dev: ETH0
I0321 01:37:43.422601 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:37:43.422613 543705 net.go:698] Add success.
I0321 01:37:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:37:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:37:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:37:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:37:53.409783 543705 memory.go:184] no items to output this cycle
I0321 01:37:53.409788 543705 cpu.go:275] no items to output this cycle
E0321 01:38:03.409745 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:03.409760 543705 memory.go:184] no items to output this cycle
I0321 01:38:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 01:38:13.409814 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:13.409863 543705 memory.go:191] Add success.
I0321 01:38:13.409869 543705 cpu.go:282] Add success.
W0321 01:38:13.409898 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:38:13.409914 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:38:13.409918 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:38:13.419724 543705 net.go:648] Add success.
I0321 01:38:13.422919 543705 net.go:770] primary dev: ETH0
I0321 01:38:13.422932 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:38:13.422943 543705 net.go:698] Add success.
I0321 01:38:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:38:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:38:14.455178 543705 disk_worker.go:708] disk space is not compliant
W0321 01:38:14.455181 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:38:14.456554 543705 disk_worker.go:494] system disk:vda1
I0321 01:38:14.456584 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:38:15.455951 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:38:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:38:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:38:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:38:16.472397 543705 disk_local_worker.go:436] Get disk info: []
I0321 01:38:23.409803 543705 cpu.go:275] no items to output this cycle
E0321 01:38:23.409804 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:23.409825 543705 memory.go:184] no items to output this cycle
I0321 01:38:27.121678 543705 disk_info.go:125] begin check local disk info of client
I0321 01:38:27.124190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:38:27.124197 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb500 0xc0001fb540]
E0321 01:38:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:33.409774 543705 memory.go:184] no items to output this cycle
I0321 01:38:33.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:38:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:43.409783 543705 memory.go:191] Add success.
I0321 01:38:43.409796 543705 cpu.go:282] Add success.
I0321 01:38:43.420026 543705 net.go:648] Add success.
I0321 01:38:43.422786 543705 net.go:770] primary dev: ETH0
I0321 01:38:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:38:43.422811 543705 net.go:698] Add success.
I0321 01:38:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:38:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:38:46.458057 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:38:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:38:53.409772 543705 memory.go:184] no items to output this cycle
I0321 01:38:53.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:39:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:03.409772 543705 memory.go:184] no items to output this cycle
I0321 01:39:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:39:13.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:13.409831 543705 memory.go:191] Add success.
W0321 01:39:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:39:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:39:13.409876 543705 cpu.go:282] Add success.
I0321 01:39:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:39:13.420512 543705 net.go:648] Add success.
I0321 01:39:13.423396 543705 net.go:770] primary dev: ETH0
I0321 01:39:13.423412 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:39:13.423427 543705 net.go:698] Add success.
I0321 01:39:13.470002 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"576ef737-9e83-4d63-9091-ad9678915e90","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:39:13.470045 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:39:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:39:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:39:14.455235 543705 disk_worker.go:708] disk space is not compliant
W0321 01:39:14.455238 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:39:14.456835 543705 disk_worker.go:494] system disk:vda1
I0321 01:39:14.456872 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:39:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:39:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:39:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:39:16.458073 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:39:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:39:23.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:23.409789 543705 cpu.go:275] no items to output this cycle
I0321 01:39:23.409792 543705 memory.go:184] no items to output this cycle
I0321 01:39:27.125672 543705 disk_info.go:125] begin check local disk info of client
I0321 01:39:27.128468 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:39:27.128474 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315640 0xc000315680]
E0321 01:39:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:33.409778 543705 memory.go:184] no items to output this cycle
I0321 01:39:33.409781 543705 cpu.go:275] no items to output this cycle
I0321 01:39:38.688136 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:39:38.688142 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:39:43.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:43.410661 543705 memory.go:191] Add success.
I0321 01:39:43.409806 543705 cpu.go:282] Add success.
I0321 01:39:43.420337 543705 net.go:648] Add success.
I0321 01:39:43.422917 543705 net.go:770] primary dev: ETH0
I0321 01:39:43.422930 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:39:43.422955 543705 net.go:698] Add success.
I0321 01:39:46.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:39:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:39:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:39:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:39:53.409769 543705 memory.go:184] no items to output this cycle
I0321 01:39:53.409790 543705 cpu.go:275] no items to output this cycle
E0321 01:40:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:03.409772 543705 memory.go:184] no items to output this cycle
I0321 01:40:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 01:40:13.409921 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:13.410059 543705 memory.go:191] Add success.
W0321 01:40:13.410098 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:40:13.410113 543705 cpu.go:282] Add success.
W0321 01:40:13.410118 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:40:13.410128 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:40:13.419702 543705 net.go:648] Add success.
I0321 01:40:13.422416 543705 net.go:770] primary dev: ETH0
I0321 01:40:13.422435 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:40:13.422454 543705 net.go:698] Add success.
I0321 01:40:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:40:14.455234 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:40:14.455247 543705 disk_worker.go:708] disk space is not compliant
W0321 01:40:14.455250 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:40:14.456636 543705 disk_worker.go:494] system disk:vda1
I0321 01:40:14.456681 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:40:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:40:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:40:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:40:16.458060 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:40:16.472383 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:40:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:23.409784 543705 cpu.go:275] no items to output this cycle
I0321 01:40:23.409786 543705 memory.go:184] no items to output this cycle
I0321 01:40:27.129676 543705 disk_info.go:125] begin check local disk info of client
I0321 01:40:27.132199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:40:27.132207 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aa300 0xc0001aa340]
E0321 01:40:33.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:33.409791 543705 memory.go:184] no items to output this cycle
I0321 01:40:33.409803 543705 cpu.go:275] no items to output this cycle
E0321 01:40:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:43.409779 543705 memory.go:191] Add success.
I0321 01:40:43.409797 543705 cpu.go:282] Add success.
I0321 01:40:43.419886 543705 net.go:648] Add success.
I0321 01:40:43.422485 543705 net.go:770] primary dev: ETH0
I0321 01:40:43.422501 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:40:43.422515 543705 net.go:698] Add success.
I0321 01:40:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:40:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:40:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:40:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:40:53.409774 543705 cpu.go:275] no items to output this cycle
I0321 01:40:53.409776 543705 memory.go:184] no items to output this cycle
E0321 01:41:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:03.409779 543705 memory.go:184] no items to output this cycle
I0321 01:41:03.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:41:13.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:13.409835 543705 memory.go:191] Add success.
I0321 01:41:13.409841 543705 cpu.go:282] Add success.
W0321 01:41:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:41:13.409883 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:41:13.409887 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:41:13.420385 543705 net.go:648] Add success.
I0321 01:41:13.423328 543705 net.go:770] primary dev: ETH0
I0321 01:41:13.423343 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:41:13.423357 543705 net.go:698] Add success.
I0321 01:41:14.454979 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:41:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:41:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0321 01:41:14.455219 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:41:14.456557 543705 disk_worker.go:494] system disk:vda1
I0321 01:41:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:41:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:41:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:41:16.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:41:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:41:16.472410 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:41:23.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:23.409776 543705 memory.go:184] no items to output this cycle
I0321 01:41:23.409803 543705 cpu.go:275] no items to output this cycle
I0321 01:41:27.133676 543705 disk_info.go:125] begin check local disk info of client
I0321 01:41:27.136186 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:41:27.136192 543705 disk_info.go:196] parse disk info done, disk is : [0xc000315200 0xc000315240]
E0321 01:41:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:33.409793 543705 memory.go:184] no items to output this cycle
I0321 01:41:33.409806 543705 cpu.go:275] no items to output this cycle
E0321 01:41:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:43.409778 543705 memory.go:191] Add success.
I0321 01:41:43.409801 543705 cpu.go:282] Add success.
I0321 01:41:43.419856 543705 net.go:648] Add success.
I0321 01:41:43.422766 543705 net.go:770] primary dev: ETH0
I0321 01:41:43.422779 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:41:43.422791 543705 net.go:698] Add success.
I0321 01:41:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:41:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:41:46.458082 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:41:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:41:53.409797 543705 memory.go:184] no items to output this cycle
I0321 01:41:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 01:42:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:03.409766 543705 memory.go:184] no items to output this cycle
I0321 01:42:03.409800 543705 cpu.go:275] no items to output this cycle
E0321 01:42:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:13.409819 543705 memory.go:191] Add success.
I0321 01:42:13.409837 543705 cpu.go:282] Add success.
W0321 01:42:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:42:13.409866 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:42:13.409870 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:42:13.420338 543705 net.go:648] Add success.
I0321 01:42:13.423023 543705 net.go:770] primary dev: ETH0
I0321 01:42:13.423037 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:42:13.423050 543705 net.go:698] Add success.
I0321 01:42:13.469536 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"30b18671-4a68-4e47-9f6f-4d80307b43cf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:42:13.469582 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 01:42:14.455153 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:42:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0321 01:42:14.455225 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:42:14.456081 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:42:14.456091 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:42:14.456098 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:42:14.456644 543705 disk_worker.go:494] system disk:vda1
I0321 01:42:14.456679 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:42:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:42:15.456824 543705 custom_config.go:64] query custom config with name: huawei_npu
E0321 01:42:16.457929 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:42:16.457929 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:42:16.457987 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:42:16.458007 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:42:16.472328 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:42:23.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:23.409777 543705 memory.go:184] no items to output this cycle
I0321 01:42:23.409809 543705 cpu.go:275] no items to output this cycle
I0321 01:42:27.137673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:42:27.140199 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:42:27.140206 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002b2140 0xc0002b2180]
E0321 01:42:33.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:33.409764 543705 memory.go:184] no items to output this cycle
I0321 01:42:33.409799 543705 cpu.go:275] no items to output this cycle
I0321 01:42:38.689143 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:42:38.689150 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:42:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:43.410608 543705 memory.go:191] Add success.
I0321 01:42:43.409786 543705 cpu.go:282] Add success.
I0321 01:42:43.420313 543705 net.go:648] Add success.
I0321 01:42:43.423305 543705 net.go:770] primary dev: ETH0
I0321 01:42:43.423320 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:42:43.423334 543705 net.go:698] Add success.
I0321 01:42:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:42:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:42:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:42:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:42:53.409800 543705 memory.go:184] no items to output this cycle
I0321 01:42:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 01:43:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:03.409777 543705 memory.go:184] no items to output this cycle
I0321 01:43:03.409782 543705 cpu.go:275] no items to output this cycle
E0321 01:43:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:13.409777 543705 memory.go:191] Add success.
W0321 01:43:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:43:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:43:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:43:13.409838 543705 cpu.go:282] Add success.
I0321 01:43:13.420565 543705 net.go:648] Add success.
I0321 01:43:13.421533 543705 net.go:770] primary dev: ETH0
I0321 01:43:13.421551 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:43:13.421569 543705 net.go:698] Add success.
I0321 01:43:14.454982 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:43:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:43:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0321 01:43:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:43:14.456658 543705 disk_worker.go:494] system disk:vda1
I0321 01:43:14.456692 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:43:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:43:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:43:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:43:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:43:16.472419 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:43:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:23.409780 543705 cpu.go:275] no items to output this cycle
I0321 01:43:23.409794 543705 memory.go:184] no items to output this cycle
I0321 01:43:27.141675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:43:27.144147 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:43:27.144154 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d580 0xc00056d5c0]
E0321 01:43:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:33.409795 543705 memory.go:184] no items to output this cycle
I0321 01:43:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 01:43:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:43.409789 543705 cpu.go:282] Add success.
I0321 01:43:43.409802 543705 memory.go:191] Add success.
I0321 01:43:43.420094 543705 net.go:648] Add success.
I0321 01:43:43.422786 543705 net.go:770] primary dev: ETH0
I0321 01:43:43.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:43:43.422811 543705 net.go:698] Add success.
I0321 01:43:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:43:46.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:43:46.458058 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:43:53.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:43:53.409780 543705 memory.go:184] no items to output this cycle
I0321 01:43:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 01:44:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:03.409770 543705 memory.go:184] no items to output this cycle
I0321 01:44:03.409801 543705 cpu.go:275] no items to output this cycle
E0321 01:44:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:13.409788 543705 memory.go:191] Add success.
W0321 01:44:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:44:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:44:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:44:13.409843 543705 cpu.go:282] Add success.
I0321 01:44:13.420333 543705 net.go:648] Add success.
I0321 01:44:13.421259 543705 net.go:770] primary dev: ETH0
I0321 01:44:13.421278 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:44:13.421298 543705 net.go:698] Add success.
I0321 01:44:14.454993 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:44:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:44:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0321 01:44:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:44:14.456550 543705 disk_worker.go:494] system disk:vda1
I0321 01:44:14.456595 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:44:15.455953 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:44:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:44:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:44:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:44:16.472378 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:44:23.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:23.409774 543705 memory.go:184] no items to output this cycle
I0321 01:44:23.409811 543705 cpu.go:275] no items to output this cycle
I0321 01:44:27.145678 543705 disk_info.go:125] begin check local disk info of client
I0321 01:44:27.148224 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:44:27.148231 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b600 0xc00007b640]
E0321 01:44:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:33.409783 543705 memory.go:184] no items to output this cycle
I0321 01:44:33.409789 543705 cpu.go:275] no items to output this cycle
E0321 01:44:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:43.409784 543705 memory.go:191] Add success.
I0321 01:44:43.409787 543705 cpu.go:282] Add success.
I0321 01:44:43.420100 543705 net.go:648] Add success.
I0321 01:44:43.423278 543705 net.go:770] primary dev: ETH0
I0321 01:44:43.423291 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:44:43.423303 543705 net.go:698] Add success.
I0321 01:44:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:44:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:44:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:44:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:44:53.409773 543705 memory.go:184] no items to output this cycle
I0321 01:44:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:45:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:03.409794 543705 memory.go:184] no items to output this cycle
I0321 01:45:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 01:45:13.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:13.409789 543705 memory.go:191] Add success.
I0321 01:45:13.409789 543705 cpu.go:282] Add success.
W0321 01:45:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:45:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:45:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:45:13.420070 543705 net.go:648] Add success.
I0321 01:45:13.422905 543705 net.go:770] primary dev: ETH0
I0321 01:45:13.422921 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:45:13.422936 543705 net.go:698] Add success.
I0321 01:45:13.463944 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"f8ebef6d-ba6b-47e9-a7b0-b84d4c013369","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:45:13.463977 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:45:14.454994 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:45:14.455214 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:45:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0321 01:45:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:45:14.456636 543705 disk_worker.go:494] system disk:vda1
I0321 01:45:14.456671 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:45:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:45:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:45:16.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:45:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:45:16.472381 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:45:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:23.409800 543705 memory.go:184] no items to output this cycle
I0321 01:45:23.409809 543705 cpu.go:275] no items to output this cycle
I0321 01:45:27.149674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:45:27.152204 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:45:27.152211 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039ba40 0xc00039ba80]
E0321 01:45:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:33.409802 543705 memory.go:184] no items to output this cycle
I0321 01:45:33.409814 543705 cpu.go:275] no items to output this cycle
I0321 01:45:38.689733 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:45:38.689740 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:45:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:43.410619 543705 memory.go:191] Add success.
I0321 01:45:43.409792 543705 cpu.go:282] Add success.
I0321 01:45:43.420308 543705 net.go:648] Add success.
I0321 01:45:43.422790 543705 net.go:770] primary dev: ETH0
I0321 01:45:43.422803 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:45:43.422815 543705 net.go:698] Add success.
I0321 01:45:46.457767 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:45:46.457826 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:45:46.457850 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:45:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:45:53.409776 543705 memory.go:184] no items to output this cycle
I0321 01:45:53.409814 543705 cpu.go:275] no items to output this cycle
E0321 01:46:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:03.409798 543705 memory.go:184] no items to output this cycle
I0321 01:46:03.409810 543705 cpu.go:275] no items to output this cycle
E0321 01:46:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:13.409811 543705 memory.go:191] Add success.
I0321 01:46:13.409821 543705 cpu.go:282] Add success.
W0321 01:46:13.409842 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:46:13.409853 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:46:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:46:13.420190 543705 net.go:648] Add success.
I0321 01:46:13.422862 543705 net.go:770] primary dev: ETH0
I0321 01:46:13.422875 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:46:13.422886 543705 net.go:698] Add success.
I0321 01:46:14.454990 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:46:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:46:14.455222 543705 disk_worker.go:708] disk space is not compliant
W0321 01:46:14.455225 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:46:14.456654 543705 disk_worker.go:494] system disk:vda1
I0321 01:46:14.456690 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:46:15.455959 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:46:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:46:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:46:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:46:16.472440 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:46:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:23.409788 543705 memory.go:184] no items to output this cycle
I0321 01:46:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 01:46:27.153673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:46:27.156203 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:46:27.156211 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056c200 0xc00056c240]
E0321 01:46:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:33.409760 543705 memory.go:184] no items to output this cycle
I0321 01:46:33.409797 543705 cpu.go:275] no items to output this cycle
E0321 01:46:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:43.409808 543705 memory.go:191] Add success.
I0321 01:46:43.409813 543705 cpu.go:282] Add success.
I0321 01:46:43.419956 543705 net.go:648] Add success.
I0321 01:46:43.422675 543705 net.go:770] primary dev: ETH0
I0321 01:46:43.422688 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:46:43.422861 543705 net.go:698] Add success.
I0321 01:46:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:46:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:46:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:46:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:46:53.409771 543705 memory.go:184] no items to output this cycle
I0321 01:46:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:47:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:03.409768 543705 memory.go:184] no items to output this cycle
I0321 01:47:03.409802 543705 cpu.go:275] no items to output this cycle
E0321 01:47:13.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:13.409781 543705 memory.go:191] Add success.
I0321 01:47:13.409789 543705 cpu.go:282] Add success.
W0321 01:47:13.409807 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:47:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:47:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:47:13.420504 543705 net.go:648] Add success.
I0321 01:47:13.421525 543705 net.go:770] primary dev: ETH0
I0321 01:47:13.421539 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:47:13.421553 543705 net.go:698] Add success.
I0321 01:47:13.453112 543705 event_worker.go:152] Polling the log file for events...
W0321 01:47:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:47:14.455223 543705 disk_worker.go:708] disk space is not compliant
W0321 01:47:14.455227 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:47:14.456066 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:47:14.456075 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:47:14.456082 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:47:14.456633 543705 disk_worker.go:494] system disk:vda1
I0321 01:47:14.456677 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:47:15.456814 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:47:15.456826 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:47:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:47:16.457990 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:47:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:47:16.458067 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:47:16.472423 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:47:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:23.409811 543705 memory.go:184] no items to output this cycle
I0321 01:47:23.409823 543705 cpu.go:275] no items to output this cycle
I0321 01:47:27.157683 543705 disk_info.go:125] begin check local disk info of client
I0321 01:47:27.160192 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:47:27.160198 543705 disk_info.go:196] parse disk info done, disk is : [0xc000323800 0xc000323840]
E0321 01:47:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:33.409798 543705 memory.go:184] no items to output this cycle
I0321 01:47:33.409816 543705 cpu.go:275] no items to output this cycle
E0321 01:47:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:43.409788 543705 memory.go:191] Add success.
I0321 01:47:43.409805 543705 cpu.go:282] Add success.
I0321 01:47:43.419857 543705 net.go:648] Add success.
I0321 01:47:43.423220 543705 net.go:770] primary dev: ETH0
I0321 01:47:43.423233 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:47:43.423246 543705 net.go:698] Add success.
I0321 01:47:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:47:46.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:47:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:47:53.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:47:53.409781 543705 memory.go:184] no items to output this cycle
I0321 01:47:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 01:48:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:03.409788 543705 memory.go:184] no items to output this cycle
I0321 01:48:03.409804 543705 cpu.go:275] no items to output this cycle
E0321 01:48:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:13.409783 543705 memory.go:191] Add success.
I0321 01:48:13.409803 543705 cpu.go:282] Add success.
W0321 01:48:13.409810 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:48:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:48:13.409824 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:48:13.420076 543705 net.go:648] Add success.
I0321 01:48:13.422739 543705 net.go:770] primary dev: ETH0
I0321 01:48:13.422755 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:48:13.422768 543705 net.go:698] Add success.
I0321 01:48:13.743551 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"cdd90b74-292f-4e5a-8125-c278efb2e7d3","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:48:13.743596 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:48:14.454720 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:48:14.454851 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:48:14.454916 543705 disk_worker.go:708] disk space is not compliant
W0321 01:48:14.454919 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:48:14.456272 543705 disk_worker.go:494] system disk:vda1
I0321 01:48:14.456332 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:48:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:48:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:48:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:48:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:48:16.472442 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:48:23.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:23.409786 543705 memory.go:184] no items to output this cycle
I0321 01:48:23.409808 543705 cpu.go:275] no items to output this cycle
I0321 01:48:27.161687 543705 disk_info.go:125] begin check local disk info of client
I0321 01:48:27.164264 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:48:27.164271 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad800 0xc0003ad840]
E0321 01:48:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:33.409773 543705 memory.go:184] no items to output this cycle
I0321 01:48:33.409803 543705 cpu.go:275] no items to output this cycle
I0321 01:48:38.691157 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:48:38.691164 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:48:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:43.410694 543705 memory.go:191] Add success.
I0321 01:48:43.409809 543705 cpu.go:282] Add success.
I0321 01:48:43.420200 543705 net.go:770] primary dev: ETH0
I0321 01:48:43.420212 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:48:43.420224 543705 net.go:698] Add success.
I0321 01:48:43.420590 543705 net.go:648] Add success.
I0321 01:48:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:48:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:48:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:48:53.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:48:53.409803 543705 memory.go:184] no items to output this cycle
I0321 01:48:53.409816 543705 cpu.go:275] no items to output this cycle
E0321 01:49:03.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:03.409769 543705 memory.go:184] no items to output this cycle
I0321 01:49:03.409912 543705 cpu.go:275] no items to output this cycle
E0321 01:49:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:13.409811 543705 memory.go:191] Add success.
I0321 01:49:13.409825 543705 cpu.go:282] Add success.
W0321 01:49:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:49:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:49:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:49:13.420313 543705 net.go:648] Add success.
I0321 01:49:13.422928 543705 net.go:770] primary dev: ETH0
I0321 01:49:13.422941 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:49:13.422953 543705 net.go:698] Add success.
I0321 01:49:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:49:14.455137 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:49:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0321 01:49:14.455231 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:49:14.456650 543705 disk_worker.go:494] system disk:vda1
I0321 01:49:14.456685 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:49:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:49:16.457966 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:49:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:49:16.458050 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:49:16.472417 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:49:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:23.409773 543705 memory.go:184] no items to output this cycle
I0321 01:49:23.409787 543705 cpu.go:275] no items to output this cycle
I0321 01:49:27.165675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:49:27.168157 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:49:27.168163 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad3c0 0xc0003ad400]
E0321 01:49:33.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:33.409789 543705 memory.go:184] no items to output this cycle
I0321 01:49:33.409803 543705 cpu.go:275] no items to output this cycle
E0321 01:49:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:43.409787 543705 memory.go:191] Add success.
I0321 01:49:43.409789 543705 cpu.go:282] Add success.
I0321 01:49:43.419860 543705 net.go:648] Add success.
I0321 01:49:43.422693 543705 net.go:770] primary dev: ETH0
I0321 01:49:43.422705 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:49:43.422718 543705 net.go:698] Add success.
I0321 01:49:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:49:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:49:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:49:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:49:53.409791 543705 memory.go:184] no items to output this cycle
I0321 01:49:53.409805 543705 cpu.go:275] no items to output this cycle
E0321 01:50:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:03.409907 543705 memory.go:184] no items to output this cycle
I0321 01:50:03.409909 543705 cpu.go:275] no items to output this cycle
E0321 01:50:13.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:13.409786 543705 memory.go:191] Add success.
W0321 01:50:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 01:50:13.409815 543705 cpu.go:282] Add success.
W0321 01:50:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:50:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:50:13.420202 543705 net.go:648] Add success.
I0321 01:50:13.422707 543705 net.go:770] primary dev: ETH0
I0321 01:50:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:50:13.422732 543705 net.go:698] Add success.
I0321 01:50:14.454989 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:50:14.455225 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:50:14.455237 543705 disk_worker.go:708] disk space is not compliant
W0321 01:50:14.455240 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:50:14.456602 543705 disk_worker.go:494] system disk:vda1
I0321 01:50:14.456650 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:50:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:50:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:50:16.458028 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:50:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:50:16.472377 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:50:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:23.409808 543705 memory.go:184] no items to output this cycle
I0321 01:50:23.409816 543705 cpu.go:275] no items to output this cycle
I0321 01:50:27.169675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:50:27.172279 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:50:27.172286 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ac900 0xc0003ac940]
E0321 01:50:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:33.409800 543705 memory.go:184] no items to output this cycle
I0321 01:50:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 01:50:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:43.409780 543705 memory.go:191] Add success.
I0321 01:50:43.409799 543705 cpu.go:282] Add success.
I0321 01:50:43.419882 543705 net.go:648] Add success.
I0321 01:50:43.422370 543705 net.go:770] primary dev: ETH0
I0321 01:50:43.422384 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:50:43.422399 543705 net.go:698] Add success.
I0321 01:50:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:50:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:50:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:50:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:50:53.409784 543705 cpu.go:275] no items to output this cycle
I0321 01:50:53.409786 543705 memory.go:184] no items to output this cycle
E0321 01:51:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:03.409776 543705 memory.go:184] no items to output this cycle
I0321 01:51:03.409809 543705 cpu.go:275] no items to output this cycle
E0321 01:51:13.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:13.409826 543705 memory.go:191] Add success.
I0321 01:51:13.409833 543705 cpu.go:282] Add success.
W0321 01:51:13.409857 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:51:13.409873 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:51:13.409877 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:51:13.420130 543705 net.go:648] Add success.
I0321 01:51:13.422765 543705 net.go:770] primary dev: ETH0
I0321 01:51:13.422780 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:51:13.422794 543705 net.go:698] Add success.
I0321 01:51:13.468391 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"ff9a5cb0-0e0b-4a03-bd8d-edf6f3240b4a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:51:13.468423 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:51:14.454994 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:51:14.455232 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:51:14.455246 543705 disk_worker.go:708] disk space is not compliant
W0321 01:51:14.455249 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:51:14.456807 543705 disk_worker.go:494] system disk:vda1
I0321 01:51:14.456838 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:51:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:51:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:51:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:51:16.458077 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:51:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:51:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:23.409778 543705 memory.go:184] no items to output this cycle
I0321 01:51:23.409804 543705 cpu.go:275] no items to output this cycle
I0321 01:51:27.173685 543705 disk_info.go:125] begin check local disk info of client
I0321 01:51:27.176175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:51:27.176182 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ace40 0xc0003ace80]
E0321 01:51:33.409798 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:33.409818 543705 memory.go:184] no items to output this cycle
I0321 01:51:33.409828 543705 cpu.go:275] no items to output this cycle
I0321 01:51:38.691303 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:51:38.691310 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:51:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:43.410901 543705 memory.go:191] Add success.
I0321 01:51:43.409830 543705 cpu.go:282] Add success.
I0321 01:51:43.420594 543705 net.go:648] Add success.
I0321 01:51:43.423142 543705 net.go:770] primary dev: ETH0
I0321 01:51:43.423155 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:51:43.423168 543705 net.go:698] Add success.
I0321 01:51:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:51:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:51:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:51:53.410353 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:51:53.410371 543705 memory.go:184] no items to output this cycle
I0321 01:51:53.410403 543705 cpu.go:275] no items to output this cycle
E0321 01:52:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:03.409783 543705 memory.go:184] no items to output this cycle
I0321 01:52:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 01:52:13.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:13.409833 543705 memory.go:191] Add success.
I0321 01:52:13.409840 543705 cpu.go:282] Add success.
W0321 01:52:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:52:13.409880 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:52:13.409883 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:52:13.420289 543705 net.go:648] Add success.
I0321 01:52:13.423540 543705 net.go:770] primary dev: ETH0
I0321 01:52:13.423553 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:52:13.423565 543705 net.go:698] Add success.
W0321 01:52:14.455146 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:52:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0321 01:52:14.455217 543705 disk_worker.go:728] disk inode is not compliant
E0321 01:52:14.456074 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:52:14.456084 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:52:14.456090 543705 custom_config.go:64] query custom config with name: gpu
I0321 01:52:14.456664 543705 disk_worker.go:494] system disk:vda1
I0321 01:52:14.456701 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:52:15.456865 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:52:15.456876 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:52:16.457948 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:52:16.457947 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:52:16.458007 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:52:16.458027 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:52:16.472361 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:52:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:23.409812 543705 memory.go:184] no items to output this cycle
I0321 01:52:23.409822 543705 cpu.go:275] no items to output this cycle
I0321 01:52:27.177677 543705 disk_info.go:125] begin check local disk info of client
I0321 01:52:27.180221 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:52:27.180227 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d280 0xc00056d2c0]
E0321 01:52:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:33.409778 543705 memory.go:184] no items to output this cycle
I0321 01:52:33.409810 543705 cpu.go:275] no items to output this cycle
E0321 01:52:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:43.409821 543705 memory.go:191] Add success.
I0321 01:52:43.409825 543705 cpu.go:282] Add success.
I0321 01:52:43.419906 543705 net.go:648] Add success.
I0321 01:52:43.422899 543705 net.go:770] primary dev: ETH0
I0321 01:52:43.422914 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:52:43.422928 543705 net.go:698] Add success.
I0321 01:52:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:52:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:52:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:52:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:52:53.409780 543705 memory.go:184] no items to output this cycle
I0321 01:52:53.409797 543705 cpu.go:275] no items to output this cycle
I0321 01:53:03.409884 543705 cpu.go:275] no items to output this cycle
E0321 01:53:03.409885 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:03.409903 543705 memory.go:184] no items to output this cycle
E0321 01:53:13.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:13.409811 543705 memory.go:191] Add success.
I0321 01:53:13.409814 543705 cpu.go:282] Add success.
W0321 01:53:13.409840 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:53:13.409852 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:53:13.409855 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:53:13.420147 543705 net.go:648] Add success.
I0321 01:53:13.422850 543705 net.go:770] primary dev: ETH0
I0321 01:53:13.422863 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:53:13.422875 543705 net.go:698] Add success.
I0321 01:53:14.454980 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:53:14.455209 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:53:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0321 01:53:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:53:14.456644 543705 disk_worker.go:494] system disk:vda1
I0321 01:53:14.456680 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:53:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:53:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:53:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:53:16.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:53:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:53:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:23.409804 543705 memory.go:184] no items to output this cycle
I0321 01:53:23.409814 543705 cpu.go:275] no items to output this cycle
I0321 01:53:27.181673 543705 disk_info.go:125] begin check local disk info of client
I0321 01:53:27.184173 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:53:27.184180 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faf80 0xc0001fafc0]
E0321 01:53:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:33.409765 543705 memory.go:184] no items to output this cycle
I0321 01:53:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 01:53:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:43.409822 543705 memory.go:191] Add success.
I0321 01:53:43.409830 543705 cpu.go:282] Add success.
I0321 01:53:43.419989 543705 net.go:648] Add success.
I0321 01:53:43.422910 543705 net.go:770] primary dev: ETH0
I0321 01:53:43.422923 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:53:43.422936 543705 net.go:698] Add success.
I0321 01:53:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:53:46.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:53:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:53:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:53:53.409780 543705 memory.go:184] no items to output this cycle
I0321 01:53:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 01:54:03.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:03.409763 543705 memory.go:184] no items to output this cycle
I0321 01:54:03.409801 543705 cpu.go:275] no items to output this cycle
E0321 01:54:13.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:13.409791 543705 memory.go:191] Add success.
I0321 01:54:13.409809 543705 cpu.go:282] Add success.
W0321 01:54:13.409817 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:54:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:54:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:54:13.420149 543705 net.go:648] Add success.
I0321 01:54:13.423286 543705 net.go:770] primary dev: ETH0
I0321 01:54:13.423302 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:54:13.423315 543705 net.go:698] Add success.
I0321 01:54:13.468461 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"fa4b2332-f39e-4e1f-9481-ce157e43393d","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:54:13.468494 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 01:54:14.454995 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:54:14.455121 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:54:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0321 01:54:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:54:14.456594 543705 disk_worker.go:494] system disk:vda1
I0321 01:54:14.456655 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:54:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:54:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:54:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:54:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:54:16.472408 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:54:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:23.409788 543705 memory.go:184] no items to output this cycle
I0321 01:54:23.409792 543705 cpu.go:275] no items to output this cycle
I0321 01:54:27.185674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:54:27.188451 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:54:27.188458 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003b8d80 0xc0003b8dc0]
E0321 01:54:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:33.409766 543705 memory.go:184] no items to output this cycle
I0321 01:54:33.409795 543705 cpu.go:275] no items to output this cycle
I0321 01:54:38.691445 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:54:38.691452 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:54:43.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:43.410670 543705 memory.go:191] Add success.
I0321 01:54:43.409796 543705 cpu.go:282] Add success.
I0321 01:54:43.420347 543705 net.go:648] Add success.
I0321 01:54:43.423010 543705 net.go:770] primary dev: ETH0
I0321 01:54:43.423024 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:54:43.423037 543705 net.go:698] Add success.
I0321 01:54:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:54:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:54:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:54:53.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:54:53.409759 543705 memory.go:184] no items to output this cycle
I0321 01:54:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 01:55:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:03.409778 543705 memory.go:184] no items to output this cycle
I0321 01:55:03.409782 543705 cpu.go:275] no items to output this cycle
E0321 01:55:13.409987 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:13.410011 543705 memory.go:191] Add success.
W0321 01:55:13.410039 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:55:13.410054 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:55:13.410057 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:55:13.409987 543705 cpu.go:282] Add success.
I0321 01:55:13.419704 543705 net.go:648] Add success.
I0321 01:55:13.422709 543705 net.go:770] primary dev: ETH0
I0321 01:55:13.422722 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:55:13.422733 543705 net.go:698] Add success.
I0321 01:55:14.455004 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:55:14.455230 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:55:14.455242 543705 disk_worker.go:708] disk space is not compliant
W0321 01:55:14.455245 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:55:14.456661 543705 disk_worker.go:494] system disk:vda1
I0321 01:55:14.456694 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:55:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:55:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:55:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:55:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:55:16.472395 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:55:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:23.409791 543705 memory.go:184] no items to output this cycle
I0321 01:55:23.409824 543705 cpu.go:275] no items to output this cycle
I0321 01:55:27.189684 543705 disk_info.go:125] begin check local disk info of client
I0321 01:55:27.192250 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:55:27.192256 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faac0 0xc0001fab00]
E0321 01:55:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:33.409773 543705 memory.go:184] no items to output this cycle
I0321 01:55:33.409780 543705 cpu.go:275] no items to output this cycle
E0321 01:55:43.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:43.409817 543705 memory.go:191] Add success.
I0321 01:55:43.409818 543705 cpu.go:282] Add success.
I0321 01:55:43.419987 543705 net.go:648] Add success.
I0321 01:55:43.422665 543705 net.go:770] primary dev: ETH0
I0321 01:55:43.422682 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:55:43.422697 543705 net.go:698] Add success.
I0321 01:55:46.457985 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:55:46.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:55:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:55:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:55:53.409767 543705 memory.go:184] no items to output this cycle
I0321 01:55:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 01:56:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:03.409768 543705 memory.go:184] no items to output this cycle
I0321 01:56:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 01:56:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:13.409793 543705 memory.go:191] Add success.
I0321 01:56:13.409798 543705 cpu.go:282] Add success.
W0321 01:56:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:56:13.409830 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:56:13.409833 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:56:13.419720 543705 net.go:648] Add success.
I0321 01:56:13.422294 543705 net.go:770] primary dev: ETH0
I0321 01:56:13.422305 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:56:13.422317 543705 net.go:698] Add success.
I0321 01:56:14.455013 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:56:14.455212 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:56:14.455315 543705 disk_worker.go:708] disk space is not compliant
W0321 01:56:14.455320 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:56:14.457160 543705 disk_worker.go:494] system disk:vda1
I0321 01:56:14.457192 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:56:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:56:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:56:16.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:56:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:56:16.472451 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:56:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:23.409789 543705 memory.go:184] no items to output this cycle
I0321 01:56:23.409792 543705 cpu.go:275] no items to output this cycle
I0321 01:56:27.193674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:56:27.196234 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:56:27.196241 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001ab040 0xc0001ab080]
E0321 01:56:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:33.409773 543705 memory.go:184] no items to output this cycle
I0321 01:56:33.409781 543705 cpu.go:275] no items to output this cycle
E0321 01:56:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:43.409784 543705 memory.go:191] Add success.
I0321 01:56:43.409784 543705 cpu.go:282] Add success.
I0321 01:56:43.419876 543705 net.go:648] Add success.
I0321 01:56:43.423056 543705 net.go:770] primary dev: ETH0
I0321 01:56:43.423071 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:56:43.423085 543705 net.go:698] Add success.
I0321 01:56:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:56:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:56:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:56:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:56:53.409799 543705 memory.go:184] no items to output this cycle
I0321 01:56:53.409810 543705 cpu.go:275] no items to output this cycle
E0321 01:57:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:03.409800 543705 memory.go:184] no items to output this cycle
I0321 01:57:03.409815 543705 cpu.go:275] no items to output this cycle
E0321 01:57:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:13.409874 543705 memory.go:191] Add success.
W0321 01:57:13.409903 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:57:13.409917 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:57:13.409920 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:57:13.409942 543705 cpu.go:282] Add success.
I0321 01:57:13.419721 543705 net.go:648] Add success.
I0321 01:57:13.422814 543705 net.go:770] primary dev: ETH0
I0321 01:57:13.422826 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:57:13.422838 543705 net.go:698] Add success.
I0321 01:57:13.429444 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 01:57:13.453624 543705 event_worker.go:152] Polling the log file for events...
I0321 01:57:13.469475 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a62898ce-8410-43d8-9c59-41ee74b04dc2","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 01:57:13.469507 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 01:57:14.455335 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:57:14.455353 543705 disk_worker.go:708] disk space is not compliant
W0321 01:57:14.455358 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:57:14.458199 543705 disk_worker.go:494] system disk:vda1
I0321 01:57:14.458245 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 01:57:14.458530 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 01:57:14.458539 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 01:57:14.458545 543705 custom_config.go:64] query custom config with name: gpu
E0321 01:57:15.457005 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 01:57:15.457019 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:57:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 01:57:16.457981 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 01:57:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:57:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:57:16.472535 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:57:23.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:23.409770 543705 memory.go:184] no items to output this cycle
I0321 01:57:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 01:57:27.197676 543705 disk_info.go:125] begin check local disk info of client
I0321 01:57:27.200449 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:57:27.200455 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aafc0 0xc0001ab000]
E0321 01:57:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:33.409785 543705 memory.go:184] no items to output this cycle
I0321 01:57:33.409794 543705 cpu.go:275] no items to output this cycle
I0321 01:57:38.691593 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 01:57:38.691599 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 01:57:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:43.410727 543705 memory.go:191] Add success.
I0321 01:57:43.409798 543705 cpu.go:282] Add success.
I0321 01:57:43.420466 543705 net.go:648] Add success.
I0321 01:57:43.423714 543705 net.go:770] primary dev: ETH0
I0321 01:57:43.423727 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:57:43.423740 543705 net.go:698] Add success.
I0321 01:57:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:57:46.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:57:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:57:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:57:53.409813 543705 memory.go:184] no items to output this cycle
I0321 01:57:53.409824 543705 cpu.go:275] no items to output this cycle
E0321 01:58:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:03.409879 543705 cpu.go:275] no items to output this cycle
I0321 01:58:03.409891 543705 memory.go:184] no items to output this cycle
E0321 01:58:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:13.409798 543705 memory.go:191] Add success.
I0321 01:58:13.409815 543705 cpu.go:282] Add success.
W0321 01:58:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:58:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:58:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:58:13.420122 543705 net.go:648] Add success.
I0321 01:58:13.423244 543705 net.go:770] primary dev: ETH0
I0321 01:58:13.423258 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:58:13.423269 543705 net.go:698] Add success.
I0321 01:58:14.455118 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:58:14.455116 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:58:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0321 01:58:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:58:14.456524 543705 disk_worker.go:494] system disk:vda1
I0321 01:58:14.456573 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:58:15.455981 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:58:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:58:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:58:16.458065 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:58:16.472468 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:58:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:23.409798 543705 memory.go:184] no items to output this cycle
I0321 01:58:23.409799 543705 cpu.go:275] no items to output this cycle
I0321 01:58:27.201674 543705 disk_info.go:125] begin check local disk info of client
I0321 01:58:27.204201 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:58:27.204208 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004d6080 0xc0004d60c0]
E0321 01:58:33.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:33.409802 543705 memory.go:184] no items to output this cycle
I0321 01:58:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 01:58:43.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:43.409797 543705 memory.go:191] Add success.
I0321 01:58:43.409802 543705 cpu.go:282] Add success.
I0321 01:58:43.419875 543705 net.go:648] Add success.
I0321 01:58:43.422538 543705 net.go:770] primary dev: ETH0
I0321 01:58:43.422550 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:58:43.422561 543705 net.go:698] Add success.
I0321 01:58:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:58:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:58:46.458064 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:58:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:58:53.409799 543705 memory.go:184] no items to output this cycle
I0321 01:58:53.409799 543705 cpu.go:275] no items to output this cycle
E0321 01:59:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:03.409807 543705 memory.go:184] no items to output this cycle
I0321 01:59:03.409820 543705 cpu.go:275] no items to output this cycle
E0321 01:59:13.409955 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:13.409963 543705 cpu.go:282] Add success.
I0321 01:59:13.409983 543705 memory.go:191] Add success.
W0321 01:59:13.410015 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 01:59:13.410038 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 01:59:13.410043 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 01:59:13.419734 543705 net.go:648] Add success.
I0321 01:59:13.422628 543705 net.go:770] primary dev: ETH0
I0321 01:59:13.422647 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:59:13.422661 543705 net.go:698] Add success.
I0321 01:59:14.453939 543705 custom_config.go:64] query custom config with name: gpu
W0321 01:59:14.455198 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 01:59:14.455209 543705 disk_worker.go:708] disk space is not compliant
W0321 01:59:14.455212 543705 disk_worker.go:728] disk inode is not compliant
I0321 01:59:14.458027 543705 disk_worker.go:494] system disk:vda1
I0321 01:59:14.458059 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 01:59:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 01:59:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:59:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:59:16.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0321 01:59:16.472447 543705 disk_local_worker.go:436] Get disk info: []
E0321 01:59:23.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:23.409797 543705 memory.go:184] no items to output this cycle
I0321 01:59:23.409835 543705 cpu.go:275] no items to output this cycle
I0321 01:59:27.205675 543705 disk_info.go:125] begin check local disk info of client
I0321 01:59:27.208165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 01:59:27.208172 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d640 0xc00056d680]
E0321 01:59:33.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:33.409780 543705 memory.go:184] no items to output this cycle
I0321 01:59:33.409787 543705 cpu.go:275] no items to output this cycle
E0321 01:59:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:43.409785 543705 memory.go:191] Add success.
I0321 01:59:43.409819 543705 cpu.go:282] Add success.
I0321 01:59:43.419878 543705 net.go:648] Add success.
I0321 01:59:43.422657 543705 net.go:770] primary dev: ETH0
I0321 01:59:43.422672 543705 net.go:802] Send network stats successfully!,count is 6
I0321 01:59:43.422686 543705 net.go:698] Add success.
I0321 01:59:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 01:59:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 01:59:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 01:59:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 01:59:53.409790 543705 memory.go:184] no items to output this cycle
I0321 01:59:53.409790 543705 cpu.go:275] no items to output this cycle
E0321 02:00:03.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:03.409801 543705 memory.go:184] no items to output this cycle
I0321 02:00:03.409805 543705 cpu.go:275] no items to output this cycle
E0321 02:00:13.409866 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:13.409892 543705 memory.go:191] Add success.
W0321 02:00:13.409920 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:00:13.409936 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:00:13.409939 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:00:13.410032 543705 cpu.go:282] Add success.
I0321 02:00:13.419750 543705 net.go:648] Add success.
I0321 02:00:13.422452 543705 net.go:770] primary dev: ETH0
I0321 02:00:13.422466 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:00:13.422479 543705 net.go:698] Add success.
I0321 02:00:13.556311 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"981959d5-9476-4e94-a32d-e9f90b5f7a4b","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:00:13.556341 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:00:14.455109 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:00:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:00:14.455173 543705 disk_worker.go:708] disk space is not compliant
W0321 02:00:14.455175 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:00:14.456554 543705 disk_worker.go:494] system disk:vda1
I0321 02:00:14.456603 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:00:15.455977 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:00:16.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:00:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:00:16.458071 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:00:16.472450 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:00:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:23.409774 543705 memory.go:184] no items to output this cycle
I0321 02:00:23.409806 543705 cpu.go:275] no items to output this cycle
I0321 02:00:27.209672 543705 disk_info.go:125] begin check local disk info of client
I0321 02:00:27.212227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:00:27.212233 543705 disk_info.go:196] parse disk info done, disk is : [0xc00056d940 0xc00056d980]
E0321 02:00:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:33.409797 543705 memory.go:184] no items to output this cycle
I0321 02:00:33.409808 543705 cpu.go:275] no items to output this cycle
I0321 02:00:38.692155 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:00:38.692162 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:00:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:43.410694 543705 memory.go:191] Add success.
I0321 02:00:43.409811 543705 cpu.go:282] Add success.
I0321 02:00:43.420387 543705 net.go:648] Add success.
I0321 02:00:43.423032 543705 net.go:770] primary dev: ETH0
I0321 02:00:43.423045 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:00:43.423059 543705 net.go:698] Add success.
I0321 02:00:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:00:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:00:46.458065 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:00:53.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:00:53.409771 543705 memory.go:184] no items to output this cycle
I0321 02:00:53.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:01:03.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:03.409774 543705 memory.go:184] no items to output this cycle
I0321 02:01:03.409795 543705 cpu.go:275] no items to output this cycle
I0321 02:01:13.409962 543705 cpu.go:282] Add success.
E0321 02:01:13.409997 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:13.410018 543705 memory.go:191] Add success.
W0321 02:01:13.410050 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:01:13.410062 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:01:13.410066 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:01:13.419722 543705 net.go:648] Add success.
I0321 02:01:13.422506 543705 net.go:770] primary dev: ETH0
I0321 02:01:13.422520 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:01:13.422533 543705 net.go:698] Add success.
I0321 02:01:14.454996 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:01:14.455139 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:01:14.455202 543705 disk_worker.go:708] disk space is not compliant
W0321 02:01:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:01:14.456525 543705 disk_worker.go:494] system disk:vda1
I0321 02:01:14.456572 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:01:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:01:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:01:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:01:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:01:16.472457 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:01:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:23.409800 543705 memory.go:184] no items to output this cycle
I0321 02:01:23.409810 543705 cpu.go:275] no items to output this cycle
I0321 02:01:27.213679 543705 disk_info.go:125] begin check local disk info of client
I0321 02:01:27.216185 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:01:27.216192 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003ad300 0xc0003ad340]
E0321 02:01:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:33.409780 543705 memory.go:184] no items to output this cycle
I0321 02:01:33.409785 543705 cpu.go:275] no items to output this cycle
E0321 02:01:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:43.409781 543705 memory.go:191] Add success.
I0321 02:01:43.409810 543705 cpu.go:282] Add success.
I0321 02:01:43.420035 543705 net.go:648] Add success.
I0321 02:01:43.422592 543705 net.go:770] primary dev: ETH0
I0321 02:01:43.422605 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:01:43.422617 543705 net.go:698] Add success.
I0321 02:01:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:01:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:01:46.458066 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:01:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:01:53.409767 543705 memory.go:184] no items to output this cycle
I0321 02:01:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 02:02:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:03.409782 543705 memory.go:184] no items to output this cycle
I0321 02:02:03.409785 543705 cpu.go:275] no items to output this cycle
E0321 02:02:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:13.409794 543705 memory.go:191] Add success.
I0321 02:02:13.409796 543705 cpu.go:282] Add success.
W0321 02:02:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:02:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:02:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:02:13.420123 543705 net.go:648] Add success.
I0321 02:02:13.422824 543705 net.go:770] primary dev: ETH0
I0321 02:02:13.422837 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:02:13.422848 543705 net.go:698] Add success.
W0321 02:02:14.455344 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:02:14.455358 543705 disk_worker.go:708] disk space is not compliant
W0321 02:02:14.455363 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:02:14.457492 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:02:14.457499 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:02:14.457503 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:02:14.457523 543705 disk_worker.go:494] system disk:vda1
I0321 02:02:14.457566 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:02:15.457028 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:02:15.457043 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:02:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:02:16.457987 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:02:16.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:02:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:02:16.472462 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:02:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:23.409784 543705 memory.go:184] no items to output this cycle
I0321 02:02:23.409800 543705 cpu.go:275] no items to output this cycle
I0321 02:02:27.217675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:02:27.220340 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:02:27.220349 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b600 0xc00007b640]
E0321 02:02:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:33.409774 543705 memory.go:184] no items to output this cycle
I0321 02:02:33.409778 543705 cpu.go:275] no items to output this cycle
E0321 02:02:43.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:43.409792 543705 memory.go:191] Add success.
I0321 02:02:43.409792 543705 cpu.go:282] Add success.
I0321 02:02:43.419861 543705 net.go:648] Add success.
I0321 02:02:43.422569 543705 net.go:770] primary dev: ETH0
I0321 02:02:43.422584 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:02:43.422597 543705 net.go:698] Add success.
I0321 02:02:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:02:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:02:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:02:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:02:53.409763 543705 memory.go:184] no items to output this cycle
I0321 02:02:53.409797 543705 cpu.go:275] no items to output this cycle
E0321 02:03:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:03.409781 543705 memory.go:184] no items to output this cycle
I0321 02:03:03.409788 543705 cpu.go:275] no items to output this cycle
E0321 02:03:13.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:13.409783 543705 memory.go:191] Add success.
I0321 02:03:13.409811 543705 cpu.go:282] Add success.
W0321 02:03:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:03:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:03:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:03:13.420094 543705 net.go:648] Add success.
I0321 02:03:13.422735 543705 net.go:770] primary dev: ETH0
I0321 02:03:13.422750 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:03:13.422764 543705 net.go:698] Add success.
I0321 02:03:13.469545 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"489421b6-9d36-47fb-9924-d67b29148d09","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:03:13.469580 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:03:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:03:14.455190 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:03:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0321 02:03:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:03:14.456731 543705 disk_worker.go:494] system disk:vda1
I0321 02:03:14.456762 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:03:15.455638 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:03:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:03:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:03:16.458080 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:03:16.472481 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:03:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:23.409781 543705 memory.go:184] no items to output this cycle
I0321 02:03:23.409787 543705 cpu.go:275] no items to output this cycle
I0321 02:03:27.221676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:03:27.224458 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:03:27.224464 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bc100 0xc0002bc140]
E0321 02:03:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:33.409776 543705 memory.go:184] no items to output this cycle
I0321 02:03:33.409782 543705 cpu.go:275] no items to output this cycle
I0321 02:03:38.692299 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:03:38.692305 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:03:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:43.410722 543705 memory.go:191] Add success.
I0321 02:03:43.409794 543705 cpu.go:282] Add success.
I0321 02:03:43.420294 543705 net.go:770] primary dev: ETH0
I0321 02:03:43.420309 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:03:43.420324 543705 net.go:698] Add success.
I0321 02:03:43.420679 543705 net.go:648] Add success.
I0321 02:03:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:03:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:03:46.458084 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:03:53.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:03:53.409802 543705 memory.go:184] no items to output this cycle
I0321 02:03:53.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:04:03.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:03.409807 543705 memory.go:184] no items to output this cycle
I0321 02:04:03.409826 543705 cpu.go:275] no items to output this cycle
E0321 02:04:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:13.409901 543705 memory.go:191] Add success.
W0321 02:04:13.409931 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:04:13.409944 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:04:13.409949 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:04:13.409990 543705 cpu.go:282] Add success.
I0321 02:04:13.419754 543705 net.go:648] Add success.
I0321 02:04:13.422528 543705 net.go:770] primary dev: ETH0
I0321 02:04:13.422543 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:04:13.422556 543705 net.go:698] Add success.
I0321 02:04:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:04:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:04:14.455201 543705 disk_worker.go:708] disk space is not compliant
W0321 02:04:14.455205 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:04:14.456582 543705 disk_worker.go:494] system disk:vda1
I0321 02:04:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:04:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:04:16.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:04:16.458060 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:04:16.458088 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:04:16.472488 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:04:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:23.409809 543705 memory.go:184] no items to output this cycle
I0321 02:04:23.409818 543705 cpu.go:275] no items to output this cycle
I0321 02:04:27.225682 543705 disk_info.go:125] begin check local disk info of client
I0321 02:04:27.228251 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:04:27.228258 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aabc0 0xc0001aac00]
E0321 02:04:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:33.409804 543705 memory.go:184] no items to output this cycle
I0321 02:04:33.409807 543705 cpu.go:275] no items to output this cycle
E0321 02:04:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:43.409778 543705 memory.go:191] Add success.
I0321 02:04:43.409804 543705 cpu.go:282] Add success.
I0321 02:04:43.420013 543705 net.go:648] Add success.
I0321 02:04:43.423009 543705 net.go:770] primary dev: ETH0
I0321 02:04:43.423022 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:04:43.423036 543705 net.go:698] Add success.
I0321 02:04:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:04:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:04:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:04:53.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:04:53.409764 543705 memory.go:184] no items to output this cycle
I0321 02:04:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:05:03.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:03.409778 543705 memory.go:184] no items to output this cycle
I0321 02:05:03.409799 543705 cpu.go:275] no items to output this cycle
E0321 02:05:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:13.409894 543705 memory.go:191] Add success.
W0321 02:05:13.409931 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:05:13.409939 543705 cpu.go:282] Add success.
W0321 02:05:13.409949 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:05:13.409989 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:05:13.419716 543705 net.go:648] Add success.
I0321 02:05:13.422346 543705 net.go:770] primary dev: ETH0
I0321 02:05:13.422359 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:05:13.422371 543705 net.go:698] Add success.
I0321 02:05:14.454962 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:05:14.455210 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:05:14.455221 543705 disk_worker.go:708] disk space is not compliant
W0321 02:05:14.455224 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:05:14.456571 543705 disk_worker.go:494] system disk:vda1
I0321 02:05:14.456600 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:05:15.455982 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:05:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:05:16.458047 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:05:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:05:16.472456 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:05:23.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:23.409772 543705 memory.go:184] no items to output this cycle
I0321 02:05:23.409802 543705 cpu.go:275] no items to output this cycle
I0321 02:05:27.229676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:05:27.232175 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:05:27.232181 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e4580 0xc0003e45c0]
E0321 02:05:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:33.409779 543705 memory.go:184] no items to output this cycle
I0321 02:05:33.409784 543705 cpu.go:275] no items to output this cycle
E0321 02:05:43.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:43.409813 543705 memory.go:191] Add success.
I0321 02:05:43.409819 543705 cpu.go:282] Add success.
I0321 02:05:43.419719 543705 net.go:770] primary dev: ETH0
I0321 02:05:43.419734 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:05:43.419747 543705 net.go:698] Add success.
I0321 02:05:43.420090 543705 net.go:648] Add success.
I0321 02:05:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:05:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:05:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:05:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:05:53.409772 543705 memory.go:184] no items to output this cycle
I0321 02:05:53.409782 543705 cpu.go:275] no items to output this cycle
E0321 02:06:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:03.409809 543705 memory.go:184] no items to output this cycle
I0321 02:06:03.409820 543705 cpu.go:275] no items to output this cycle
E0321 02:06:13.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:13.409868 543705 memory.go:191] Add success.
W0321 02:06:13.409900 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:06:13.409913 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:06:13.409916 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:06:13.409925 543705 cpu.go:282] Add success.
I0321 02:06:13.419711 543705 net.go:648] Add success.
I0321 02:06:13.422702 543705 net.go:770] primary dev: ETH0
I0321 02:06:13.422715 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:06:13.422726 543705 net.go:698] Add success.
I0321 02:06:13.463974 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"34967508-a74f-4d8c-afad-3a5950389716","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:06:13.464008 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:06:14.454955 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:06:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:06:14.455208 543705 disk_worker.go:708] disk space is not compliant
W0321 02:06:14.455211 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:06:14.456687 543705 disk_worker.go:494] system disk:vda1
I0321 02:06:14.456723 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:06:15.455983 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:06:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:06:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:06:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:06:16.472461 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:06:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:23.409808 543705 memory.go:184] no items to output this cycle
I0321 02:06:23.409820 543705 cpu.go:275] no items to output this cycle
I0321 02:06:27.233674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:06:27.236471 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:06:27.236478 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd640 0xc0002bd680]
E0321 02:06:33.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:33.409779 543705 cpu.go:275] no items to output this cycle
I0321 02:06:33.409784 543705 memory.go:184] no items to output this cycle
I0321 02:06:38.693153 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:06:38.693162 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:06:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:43.410633 543705 memory.go:191] Add success.
I0321 02:06:43.409792 543705 cpu.go:282] Add success.
I0321 02:06:43.420405 543705 net.go:648] Add success.
I0321 02:06:43.422989 543705 net.go:770] primary dev: ETH0
I0321 02:06:43.423002 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:06:43.423015 543705 net.go:698] Add success.
I0321 02:06:46.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:06:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:06:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:06:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:06:53.409790 543705 memory.go:184] no items to output this cycle
I0321 02:06:53.409810 543705 cpu.go:275] no items to output this cycle
E0321 02:07:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:03.409783 543705 memory.go:184] no items to output this cycle
I0321 02:07:03.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:07:13.409877 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:13.409903 543705 memory.go:191] Add success.
W0321 02:07:13.409932 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:07:13.409949 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:07:13.409948 543705 cpu.go:282] Add success.
I0321 02:07:13.409953 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:07:13.419708 543705 net.go:648] Add success.
I0321 02:07:13.422320 543705 net.go:770] primary dev: ETH0
I0321 02:07:13.422333 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:07:13.422344 543705 net.go:698] Add success.
I0321 02:07:13.452769 543705 event_worker.go:152] Polling the log file for events...
W0321 02:07:14.455158 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:07:14.455168 543705 disk_worker.go:708] disk space is not compliant
W0321 02:07:14.455171 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:07:14.456782 543705 disk_worker.go:494] system disk:vda1
I0321 02:07:14.456820 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:07:14.457083 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:07:14.457091 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:07:14.457095 543705 custom_config.go:64] query custom config with name: gpu
E0321 02:07:15.456986 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:07:15.457000 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:07:16.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:07:16.457986 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:07:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:07:16.458064 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:07:16.472436 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:07:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:23.409795 543705 memory.go:184] no items to output this cycle
I0321 02:07:23.409805 543705 cpu.go:275] no items to output this cycle
I0321 02:07:27.237674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:07:27.240166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:07:27.240173 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd6c0 0xc0002bd700]
E0321 02:07:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:33.409767 543705 memory.go:184] no items to output this cycle
I0321 02:07:33.409797 543705 cpu.go:275] no items to output this cycle
E0321 02:07:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:43.409790 543705 memory.go:191] Add success.
I0321 02:07:43.409806 543705 cpu.go:282] Add success.
I0321 02:07:43.419964 543705 net.go:648] Add success.
I0321 02:07:43.422744 543705 net.go:770] primary dev: ETH0
I0321 02:07:43.422759 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:07:43.422773 543705 net.go:698] Add success.
I0321 02:07:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:07:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:07:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:07:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:07:53.409794 543705 memory.go:184] no items to output this cycle
I0321 02:07:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 02:08:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:03.409772 543705 memory.go:184] no items to output this cycle
I0321 02:08:03.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:08:13.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:13.409903 543705 memory.go:191] Add success.
I0321 02:08:13.409931 543705 cpu.go:282] Add success.
W0321 02:08:13.409940 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:08:13.409953 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:08:13.409956 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:08:13.419722 543705 net.go:648] Add success.
I0321 02:08:13.422331 543705 net.go:770] primary dev: ETH0
I0321 02:08:13.422343 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:08:13.422354 543705 net.go:698] Add success.
I0321 02:08:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:08:14.455179 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:08:14.455190 543705 disk_worker.go:708] disk space is not compliant
W0321 02:08:14.455193 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:08:14.456821 543705 disk_worker.go:494] system disk:vda1
I0321 02:08:14.456851 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:08:15.455979 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:08:16.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:08:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:08:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:08:16.472500 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:08:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:23.409789 543705 memory.go:184] no items to output this cycle
I0321 02:08:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 02:08:27.241677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:08:27.244168 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:08:27.244174 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053ae80 0xc00053aec0]
E0321 02:08:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:33.409792 543705 memory.go:184] no items to output this cycle
I0321 02:08:33.409807 543705 cpu.go:275] no items to output this cycle
E0321 02:08:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:43.409816 543705 memory.go:191] Add success.
I0321 02:08:43.409816 543705 cpu.go:282] Add success.
I0321 02:08:43.420461 543705 net.go:648] Add success.
I0321 02:08:43.423074 543705 net.go:770] primary dev: ETH0
I0321 02:08:43.423086 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:08:43.423099 543705 net.go:698] Add success.
I0321 02:08:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:08:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:08:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:08:53.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:08:53.409799 543705 memory.go:184] no items to output this cycle
I0321 02:08:53.409808 543705 cpu.go:275] no items to output this cycle
E0321 02:09:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:03.409774 543705 memory.go:184] no items to output this cycle
I0321 02:09:03.409796 543705 cpu.go:275] no items to output this cycle
E0321 02:09:13.409862 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:13.409888 543705 memory.go:191] Add success.
W0321 02:09:13.409919 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:09:13.409928 543705 cpu.go:282] Add success.
W0321 02:09:13.409931 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:09:13.409938 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:09:13.419709 543705 net.go:648] Add success.
I0321 02:09:13.422222 543705 net.go:770] primary dev: ETH0
I0321 02:09:13.422236 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:09:13.422247 543705 net.go:698] Add success.
I0321 02:09:13.468295 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"78e139b5-c408-4bda-977c-0f696e63791a","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:09:13.468340 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:09:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:09:14.455144 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:09:14.455227 543705 disk_worker.go:708] disk space is not compliant
W0321 02:09:14.455230 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:09:14.456765 543705 disk_worker.go:494] system disk:vda1
I0321 02:09:14.456793 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:09:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:09:16.457999 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:09:16.458068 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:09:16.458095 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:09:16.472455 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:09:23.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:23.409787 543705 memory.go:184] no items to output this cycle
I0321 02:09:23.409792 543705 cpu.go:275] no items to output this cycle
I0321 02:09:27.245673 543705 disk_info.go:125] begin check local disk info of client
I0321 02:09:27.248177 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:09:27.248183 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053b640 0xc00053b680]
E0321 02:09:33.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:33.409767 543705 memory.go:184] no items to output this cycle
I0321 02:09:33.409807 543705 cpu.go:275] no items to output this cycle
I0321 02:09:38.693740 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:09:38.693747 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:09:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:43.410568 543705 memory.go:191] Add success.
I0321 02:09:43.409813 543705 cpu.go:282] Add success.
I0321 02:09:43.420301 543705 net.go:648] Add success.
I0321 02:09:43.422838 543705 net.go:770] primary dev: ETH0
I0321 02:09:43.422852 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:09:43.422865 543705 net.go:698] Add success.
I0321 02:09:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:09:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:09:46.458072 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:09:53.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:09:53.409813 543705 memory.go:184] no items to output this cycle
I0321 02:09:53.409822 543705 cpu.go:275] no items to output this cycle
E0321 02:10:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:03.409798 543705 memory.go:184] no items to output this cycle
I0321 02:10:03.409810 543705 cpu.go:275] no items to output this cycle
E0321 02:10:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:13.409862 543705 memory.go:191] Add success.
W0321 02:10:13.409892 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:10:13.409904 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:10:13.409907 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:10:13.409926 543705 cpu.go:282] Add success.
I0321 02:10:13.419709 543705 net.go:648] Add success.
I0321 02:10:13.422293 543705 net.go:770] primary dev: ETH0
I0321 02:10:13.422308 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:10:13.422322 543705 net.go:698] Add success.
I0321 02:10:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:10:14.455150 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:10:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0321 02:10:14.455165 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:10:14.456497 543705 disk_worker.go:494] system disk:vda1
I0321 02:10:14.456536 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:10:15.455972 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:10:16.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:10:16.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:10:16.458075 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:10:16.472462 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:10:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:23.409783 543705 memory.go:184] no items to output this cycle
I0321 02:10:23.409806 543705 cpu.go:275] no items to output this cycle
I0321 02:10:27.249677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:10:27.252219 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:10:27.252225 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5840 0xc0000c5880]
E0321 02:10:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:33.409778 543705 memory.go:184] no items to output this cycle
I0321 02:10:33.409782 543705 cpu.go:275] no items to output this cycle
E0321 02:10:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:43.409787 543705 cpu.go:282] Add success.
I0321 02:10:43.409797 543705 memory.go:191] Add success.
I0321 02:10:43.420053 543705 net.go:648] Add success.
I0321 02:10:43.422786 543705 net.go:770] primary dev: ETH0
I0321 02:10:43.422799 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:10:43.422812 543705 net.go:698] Add success.
I0321 02:10:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:10:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:10:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:10:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:10:53.409769 543705 memory.go:184] no items to output this cycle
I0321 02:10:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:11:03.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:03.409783 543705 memory.go:184] no items to output this cycle
I0321 02:11:03.409788 543705 cpu.go:275] no items to output this cycle
E0321 02:11:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:13.409788 543705 memory.go:191] Add success.
I0321 02:11:13.409802 543705 cpu.go:282] Add success.
W0321 02:11:13.409814 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:11:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:11:13.409828 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:11:13.419723 543705 net.go:648] Add success.
I0321 02:11:13.422699 543705 net.go:770] primary dev: ETH0
I0321 02:11:13.422712 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:11:13.422724 543705 net.go:698] Add success.
I0321 02:11:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:11:14.455202 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:11:14.455212 543705 disk_worker.go:708] disk space is not compliant
W0321 02:11:14.455215 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:11:14.456612 543705 disk_worker.go:494] system disk:vda1
I0321 02:11:14.456642 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:11:15.455986 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:11:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:11:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:11:16.458072 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:11:16.472466 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:11:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:23.409798 543705 memory.go:184] no items to output this cycle
I0321 02:11:23.409810 543705 cpu.go:275] no items to output this cycle
I0321 02:11:27.253672 543705 disk_info.go:125] begin check local disk info of client
I0321 02:11:27.256158 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:11:27.256165 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5e80 0xc0000c5ec0]
E0321 02:11:33.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:33.409773 543705 memory.go:184] no items to output this cycle
I0321 02:11:33.409781 543705 cpu.go:275] no items to output this cycle
E0321 02:11:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:43.409787 543705 cpu.go:282] Add success.
I0321 02:11:43.409792 543705 memory.go:191] Add success.
I0321 02:11:43.419858 543705 net.go:648] Add success.
I0321 02:11:43.422559 543705 net.go:770] primary dev: ETH0
I0321 02:11:43.422571 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:11:43.422584 543705 net.go:698] Add success.
I0321 02:11:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:11:46.458051 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:11:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:11:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:11:53.409783 543705 memory.go:184] no items to output this cycle
I0321 02:11:53.409784 543705 cpu.go:275] no items to output this cycle
E0321 02:12:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:03.409795 543705 memory.go:184] no items to output this cycle
I0321 02:12:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 02:12:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:13.409784 543705 memory.go:191] Add success.
I0321 02:12:13.409785 543705 cpu.go:282] Add success.
W0321 02:12:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:12:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:12:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:12:13.419728 543705 net.go:648] Add success.
I0321 02:12:13.422378 543705 net.go:770] primary dev: ETH0
I0321 02:12:13.422392 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:12:13.422402 543705 net.go:698] Add success.
I0321 02:12:13.469327 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"a2e2fd30-22a0-48ef-9a10-10c76d443c54","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:12:13.469360 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 02:12:14.455252 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:12:14.455270 543705 disk_worker.go:708] disk space is not compliant
W0321 02:12:14.455275 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:12:14.456230 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:12:14.456242 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:12:14.456248 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:12:14.456903 543705 disk_worker.go:494] system disk:vda1
I0321 02:12:14.456954 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:12:15.457143 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:12:15.457161 543705 custom_config.go:64] query custom config with name: huawei_npu
E0321 02:12:16.458036 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:12:16.458036 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:12:16.458094 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:12:16.458114 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:12:16.472480 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:12:23.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:23.409791 543705 memory.go:184] no items to output this cycle
I0321 02:12:23.409793 543705 cpu.go:275] no items to output this cycle
I0321 02:12:27.257726 543705 disk_info.go:125] begin check local disk info of client
I0321 02:12:27.260269 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:12:27.260275 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb5c0 0xc0001fb600]
E0321 02:12:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:33.409795 543705 memory.go:184] no items to output this cycle
I0321 02:12:33.409809 543705 cpu.go:275] no items to output this cycle
I0321 02:12:38.693891 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:12:38.693898 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:12:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:43.410636 543705 memory.go:191] Add success.
I0321 02:12:43.409822 543705 cpu.go:282] Add success.
I0321 02:12:43.420323 543705 net.go:648] Add success.
I0321 02:12:43.422836 543705 net.go:770] primary dev: ETH0
I0321 02:12:43.422850 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:12:43.422862 543705 net.go:698] Add success.
I0321 02:12:46.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:12:46.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:12:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:12:53.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:12:53.409801 543705 memory.go:184] no items to output this cycle
I0321 02:12:53.409809 543705 cpu.go:275] no items to output this cycle
E0321 02:13:03.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:03.409798 543705 memory.go:184] no items to output this cycle
I0321 02:13:03.409813 543705 cpu.go:275] no items to output this cycle
E0321 02:13:13.409886 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:13.409948 543705 memory.go:191] Add success.
I0321 02:13:13.409973 543705 cpu.go:282] Add success.
W0321 02:13:13.409977 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:13:13.409989 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:13:13.409992 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:13:13.419734 543705 net.go:648] Add success.
I0321 02:13:13.422446 543705 net.go:770] primary dev: ETH0
I0321 02:13:13.422460 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:13:13.422474 543705 net.go:698] Add success.
I0321 02:13:14.454975 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:13:14.455124 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:13:14.455204 543705 disk_worker.go:708] disk space is not compliant
W0321 02:13:14.455207 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:13:14.456597 543705 disk_worker.go:494] system disk:vda1
I0321 02:13:14.456627 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:13:15.455980 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:13:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:13:16.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:13:16.458088 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:13:16.472497 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:13:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:23.409797 543705 memory.go:184] no items to output this cycle
I0321 02:13:23.409812 543705 cpu.go:275] no items to output this cycle
I0321 02:13:27.261677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:13:27.264205 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:13:27.264212 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bce40 0xc0002bce80]
E0321 02:13:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:33.409797 543705 memory.go:184] no items to output this cycle
I0321 02:13:33.409811 543705 cpu.go:275] no items to output this cycle
E0321 02:13:43.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:43.409831 543705 memory.go:191] Add success.
I0321 02:13:43.409832 543705 cpu.go:282] Add success.
I0321 02:13:43.419890 543705 net.go:648] Add success.
I0321 02:13:43.422368 543705 net.go:770] primary dev: ETH0
I0321 02:13:43.422384 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:13:43.422399 543705 net.go:698] Add success.
I0321 02:13:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:13:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:13:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:13:53.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:13:53.409812 543705 memory.go:184] no items to output this cycle
I0321 02:13:53.409819 543705 cpu.go:275] no items to output this cycle
E0321 02:14:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:03.409764 543705 memory.go:184] no items to output this cycle
I0321 02:14:03.409794 543705 cpu.go:275] no items to output this cycle
E0321 02:14:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:13.409785 543705 memory.go:191] Add success.
I0321 02:14:13.409788 543705 cpu.go:282] Add success.
W0321 02:14:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:14:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:14:13.409826 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:14:13.420389 543705 net.go:648] Add success.
I0321 02:14:13.423294 543705 net.go:770] primary dev: ETH0
I0321 02:14:13.423308 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:14:13.423319 543705 net.go:698] Add success.
I0321 02:14:14.454947 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:14:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:14:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0321 02:14:14.455204 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:14:14.456595 543705 disk_worker.go:494] system disk:vda1
I0321 02:14:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:14:15.455978 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:14:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:14:16.458071 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:14:16.458099 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:14:16.472471 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:14:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:23.409781 543705 memory.go:184] no items to output this cycle
I0321 02:14:23.409805 543705 cpu.go:275] no items to output this cycle
I0321 02:14:27.265678 543705 disk_info.go:125] begin check local disk info of client
I0321 02:14:27.268235 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:14:27.268243 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bcb80 0xc0002bcbc0]
E0321 02:14:33.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:33.409766 543705 memory.go:184] no items to output this cycle
I0321 02:14:33.409797 543705 cpu.go:275] no items to output this cycle
E0321 02:14:43.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:43.409808 543705 memory.go:191] Add success.
I0321 02:14:43.409819 543705 cpu.go:282] Add success.
I0321 02:14:43.419704 543705 net.go:770] primary dev: ETH0
I0321 02:14:43.419719 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:14:43.419734 543705 net.go:698] Add success.
I0321 02:14:43.420095 543705 net.go:648] Add success.
I0321 02:14:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:14:46.458058 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:14:46.458083 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:14:53.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:14:53.409782 543705 memory.go:184] no items to output this cycle
I0321 02:14:53.409788 543705 cpu.go:275] no items to output this cycle
E0321 02:15:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:03.409775 543705 memory.go:184] no items to output this cycle
I0321 02:15:03.409779 543705 cpu.go:275] no items to output this cycle
E0321 02:15:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:13.409900 543705 memory.go:191] Add success.
I0321 02:15:13.409926 543705 cpu.go:282] Add success.
W0321 02:15:13.409934 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:15:13.409947 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:15:13.409950 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:15:13.419736 543705 net.go:648] Add success.
I0321 02:15:13.422904 543705 net.go:770] primary dev: ETH0
I0321 02:15:13.422917 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:15:13.422929 543705 net.go:698] Add success.
I0321 02:15:13.477090 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"1ee8f3d0-7d11-4e3b-a8ea-552a6fa246a9","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:15:13.477121 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:15:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:15:14.455201 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:15:14.455211 543705 disk_worker.go:708] disk space is not compliant
W0321 02:15:14.455214 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:15:14.456792 543705 disk_worker.go:494] system disk:vda1
I0321 02:15:14.456820 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:15:15.456001 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:15:16.458000 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:15:16.458084 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:15:16.458116 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:15:16.472498 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:15:23.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:23.409778 543705 memory.go:184] no items to output this cycle
I0321 02:15:23.409782 543705 cpu.go:275] no items to output this cycle
I0321 02:15:27.269675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:15:27.272189 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:15:27.272195 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034b740 0xc00034b780]
E0321 02:15:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:33.409765 543705 memory.go:184] no items to output this cycle
I0321 02:15:33.409804 543705 cpu.go:275] no items to output this cycle
I0321 02:15:38.695172 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:15:38.695179 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:15:43.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:43.410744 543705 memory.go:191] Add success.
I0321 02:15:43.409812 543705 cpu.go:282] Add success.
I0321 02:15:43.420441 543705 net.go:648] Add success.
I0321 02:15:43.423070 543705 net.go:770] primary dev: ETH0
I0321 02:15:43.423094 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:15:43.423109 543705 net.go:698] Add success.
I0321 02:15:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:15:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:15:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:15:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:15:53.409788 543705 memory.go:184] no items to output this cycle
I0321 02:15:53.409789 543705 cpu.go:275] no items to output this cycle
E0321 02:16:03.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:03.409764 543705 memory.go:184] no items to output this cycle
I0321 02:16:03.409801 543705 cpu.go:275] no items to output this cycle
E0321 02:16:13.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:13.409791 543705 memory.go:191] Add success.
I0321 02:16:13.409793 543705 cpu.go:282] Add success.
W0321 02:16:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:16:13.409829 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:16:13.409832 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:16:13.420047 543705 net.go:648] Add success.
I0321 02:16:13.423075 543705 net.go:770] primary dev: ETH0
I0321 02:16:13.423089 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:16:13.423111 543705 net.go:698] Add success.
I0321 02:16:14.454961 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:16:14.455117 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:16:14.455182 543705 disk_worker.go:708] disk space is not compliant
W0321 02:16:14.455185 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:16:14.456518 543705 disk_worker.go:494] system disk:vda1
I0321 02:16:14.456560 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:16:15.456010 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:16:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:16:16.458056 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:16:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:16:16.472497 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:16:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:23.409794 543705 memory.go:184] no items to output this cycle
I0321 02:16:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 02:16:27.273674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:16:27.276228 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:16:27.276233 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039a880 0xc00039a8c0]
E0321 02:16:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:33.409773 543705 memory.go:184] no items to output this cycle
I0321 02:16:33.409776 543705 cpu.go:275] no items to output this cycle
E0321 02:16:43.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:43.409787 543705 memory.go:191] Add success.
I0321 02:16:43.409791 543705 cpu.go:282] Add success.
I0321 02:16:43.420068 543705 net.go:648] Add success.
I0321 02:16:43.422954 543705 net.go:770] primary dev: ETH0
I0321 02:16:43.422969 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:16:43.422985 543705 net.go:698] Add success.
I0321 02:16:46.457971 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:16:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:16:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:16:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:16:53.409802 543705 memory.go:184] no items to output this cycle
I0321 02:16:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 02:17:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:03.409786 543705 memory.go:184] no items to output this cycle
I0321 02:17:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 02:17:13.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:13.409776 543705 memory.go:191] Add success.
W0321 02:17:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:17:13.409801 543705 cpu.go:282] Add success.
W0321 02:17:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:17:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:17:13.420134 543705 net.go:648] Add success.
I0321 02:17:13.422705 543705 net.go:770] primary dev: ETH0
I0321 02:17:13.422720 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:17:13.422734 543705 net.go:698] Add success.
I0321 02:17:13.453287 543705 event_worker.go:152] Polling the log file for events...
W0321 02:17:14.455165 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:17:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0321 02:17:14.455177 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:17:14.456937 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:17:14.456946 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:17:14.456953 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:17:14.456999 543705 disk_worker.go:494] system disk:vda1
I0321 02:17:14.457040 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:17:15.456920 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:17:15.456931 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:17:16.457623 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:17:16.457623 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:17:16.457699 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:17:16.457723 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:17:16.472039 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:17:23.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:23.409782 543705 memory.go:184] no items to output this cycle
I0321 02:17:23.409784 543705 cpu.go:275] no items to output this cycle
I0321 02:17:27.277677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:17:27.280446 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:17:27.280454 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001faf80 0xc0001fafc0]
E0321 02:17:33.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:33.409767 543705 memory.go:184] no items to output this cycle
I0321 02:17:33.409797 543705 cpu.go:275] no items to output this cycle
E0321 02:17:43.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:43.409813 543705 memory.go:191] Add success.
I0321 02:17:43.409818 543705 cpu.go:282] Add success.
I0321 02:17:43.419920 543705 net.go:648] Add success.
I0321 02:17:43.422493 543705 net.go:770] primary dev: ETH0
I0321 02:17:43.422507 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:17:43.422518 543705 net.go:698] Add success.
I0321 02:17:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:17:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:17:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:17:53.409876 543705 cpu.go:275] no items to output this cycle
E0321 02:17:53.409949 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:17:53.409959 543705 memory.go:184] no items to output this cycle
E0321 02:18:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:03.409804 543705 memory.go:184] no items to output this cycle
I0321 02:18:03.409819 543705 cpu.go:275] no items to output this cycle
E0321 02:18:13.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:13.409777 543705 memory.go:191] Add success.
I0321 02:18:13.409799 543705 cpu.go:282] Add success.
W0321 02:18:13.409802 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:18:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:18:13.409816 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:18:13.420071 543705 net.go:770] primary dev: ETH0
I0321 02:18:13.420084 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:18:13.420096 543705 net.go:698] Add success.
I0321 02:18:13.420467 543705 net.go:648] Add success.
I0321 02:18:13.471548 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"28daf914-6b02-4b83-9956-107e454fa4ea","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:18:13.471582 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:18:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:18:14.455115 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:18:14.455200 543705 disk_worker.go:708] disk space is not compliant
W0321 02:18:14.455203 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:18:14.456574 543705 disk_worker.go:494] system disk:vda1
I0321 02:18:14.456604 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:18:15.455971 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:18:16.458004 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:18:16.458077 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:18:16.458106 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:18:16.472489 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:18:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:23.409788 543705 memory.go:184] no items to output this cycle
I0321 02:18:23.409810 543705 cpu.go:275] no items to output this cycle
I0321 02:18:27.281673 543705 disk_info.go:125] begin check local disk info of client
I0321 02:18:27.284322 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:18:27.284328 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003e8b00 0xc0003e8b40]
E0321 02:18:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:33.409794 543705 memory.go:184] no items to output this cycle
I0321 02:18:33.409806 543705 cpu.go:275] no items to output this cycle
I0321 02:18:38.695314 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:18:38.695320 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:18:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:43.410757 543705 memory.go:191] Add success.
I0321 02:18:43.409814 543705 cpu.go:282] Add success.
I0321 02:18:43.420546 543705 net.go:648] Add success.
I0321 02:18:43.423186 543705 net.go:770] primary dev: ETH0
I0321 02:18:43.423199 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:18:43.423211 543705 net.go:698] Add success.
I0321 02:18:46.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:18:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:18:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:18:53.409811 543705 cpu.go:275] no items to output this cycle
E0321 02:18:53.409813 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:18:53.409832 543705 memory.go:184] no items to output this cycle
E0321 02:19:03.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:03.409785 543705 memory.go:184] no items to output this cycle
I0321 02:19:03.409808 543705 cpu.go:275] no items to output this cycle
E0321 02:19:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:13.409793 543705 memory.go:191] Add success.
I0321 02:19:13.409797 543705 cpu.go:282] Add success.
W0321 02:19:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:19:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:19:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:19:13.420124 543705 net.go:648] Add success.
I0321 02:19:13.422583 543705 net.go:770] primary dev: ETH0
I0321 02:19:13.422598 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:19:13.422609 543705 net.go:698] Add success.
I0321 02:19:14.454953 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:19:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:19:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0321 02:19:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:19:14.456582 543705 disk_worker.go:494] system disk:vda1
I0321 02:19:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:19:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:19:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:19:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:19:16.458087 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:19:16.472463 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:19:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:23.409776 543705 memory.go:184] no items to output this cycle
I0321 02:19:23.409808 543705 cpu.go:275] no items to output this cycle
I0321 02:19:27.285675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:19:27.288491 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:19:27.288497 543705 disk_info.go:196] parse disk info done, disk is : [0xc00037e540 0xc00037e580]
E0321 02:19:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:33.409782 543705 memory.go:184] no items to output this cycle
I0321 02:19:33.409796 543705 cpu.go:275] no items to output this cycle
E0321 02:19:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:43.409789 543705 memory.go:191] Add success.
I0321 02:19:43.409797 543705 cpu.go:282] Add success.
I0321 02:19:43.419991 543705 net.go:648] Add success.
I0321 02:19:43.422831 543705 net.go:770] primary dev: ETH0
I0321 02:19:43.422847 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:19:43.422861 543705 net.go:698] Add success.
I0321 02:19:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:19:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:19:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:19:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:19:53.409889 543705 cpu.go:275] no items to output this cycle
I0321 02:19:53.409892 543705 memory.go:184] no items to output this cycle
E0321 02:20:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:03.409779 543705 memory.go:184] no items to output this cycle
I0321 02:20:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 02:20:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:13.409814 543705 memory.go:191] Add success.
I0321 02:20:13.409818 543705 cpu.go:282] Add success.
W0321 02:20:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:20:13.409861 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:20:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:20:13.420518 543705 net.go:648] Add success.
I0321 02:20:13.423322 543705 net.go:770] primary dev: ETH0
I0321 02:20:13.423337 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:20:13.423351 543705 net.go:698] Add success.
I0321 02:20:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:20:14.455184 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:20:14.455195 543705 disk_worker.go:708] disk space is not compliant
W0321 02:20:14.455198 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:20:14.456578 543705 disk_worker.go:494] system disk:vda1
I0321 02:20:14.456607 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:20:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:20:16.457991 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:20:16.458071 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:20:16.458098 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:20:16.472483 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:20:23.409790 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:23.409811 543705 memory.go:184] no items to output this cycle
I0321 02:20:23.409820 543705 cpu.go:275] no items to output this cycle
I0321 02:20:27.289674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:20:27.292462 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:20:27.292470 543705 disk_info.go:196] parse disk info done, disk is : [0xc000374000 0xc000374040]
E0321 02:20:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:33.409794 543705 memory.go:184] no items to output this cycle
I0321 02:20:33.409810 543705 cpu.go:275] no items to output this cycle
E0321 02:20:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:43.409790 543705 memory.go:191] Add success.
I0321 02:20:43.409809 543705 cpu.go:282] Add success.
I0321 02:20:43.419867 543705 net.go:648] Add success.
I0321 02:20:43.422326 543705 net.go:770] primary dev: ETH0
I0321 02:20:43.422342 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:20:43.422357 543705 net.go:698] Add success.
I0321 02:20:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:20:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:20:46.458081 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:20:53.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:20:53.409887 543705 cpu.go:275] no items to output this cycle
I0321 02:20:53.409920 543705 memory.go:184] no items to output this cycle
E0321 02:21:03.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:03.409797 543705 memory.go:184] no items to output this cycle
I0321 02:21:03.409813 543705 cpu.go:275] no items to output this cycle
E0321 02:21:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:13.409793 543705 memory.go:191] Add success.
I0321 02:21:13.409796 543705 cpu.go:282] Add success.
W0321 02:21:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:21:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:21:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:21:13.420062 543705 net.go:648] Add success.
I0321 02:21:13.422712 543705 net.go:770] primary dev: ETH0
I0321 02:21:13.422726 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:21:13.422738 543705 net.go:698] Add success.
I0321 02:21:13.468649 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"7188f136-c721-4fcd-ace9-e763dea26ef7","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:21:13.468681 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:21:14.454965 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:21:14.455182 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:21:14.455192 543705 disk_worker.go:708] disk space is not compliant
W0321 02:21:14.455195 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:21:14.456616 543705 disk_worker.go:494] system disk:vda1
I0321 02:21:14.456646 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:21:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:21:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:21:16.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:21:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:21:16.472473 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:21:23.410197 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:23.410217 543705 memory.go:184] no items to output this cycle
I0321 02:21:23.410232 543705 cpu.go:275] no items to output this cycle
I0321 02:21:27.293680 543705 disk_info.go:125] begin check local disk info of client
I0321 02:21:27.296198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:21:27.296205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbe40 0xc0001fbe80]
E0321 02:21:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:33.409795 543705 memory.go:184] no items to output this cycle
I0321 02:21:33.409811 543705 cpu.go:275] no items to output this cycle
I0321 02:21:38.695460 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:21:38.695467 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:21:43.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:43.410682 543705 memory.go:191] Add success.
I0321 02:21:43.409802 543705 cpu.go:282] Add success.
I0321 02:21:43.420458 543705 net.go:648] Add success.
I0321 02:21:43.423251 543705 net.go:770] primary dev: ETH0
I0321 02:21:43.423266 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:21:43.423280 543705 net.go:698] Add success.
I0321 02:21:46.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:21:46.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:21:46.458068 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:21:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:21:53.409782 543705 memory.go:184] no items to output this cycle
I0321 02:21:53.409799 543705 cpu.go:275] no items to output this cycle
E0321 02:22:03.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:03.409870 543705 memory.go:184] no items to output this cycle
I0321 02:22:03.409940 543705 cpu.go:275] no items to output this cycle
E0321 02:22:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:13.409785 543705 memory.go:191] Add success.
W0321 02:22:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:22:13.409819 543705 cpu.go:282] Add success.
W0321 02:22:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:22:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:22:13.420102 543705 net.go:648] Add success.
I0321 02:22:13.422786 543705 net.go:770] primary dev: ETH0
I0321 02:22:13.422800 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:22:13.422814 543705 net.go:698] Add success.
W0321 02:22:14.455168 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:22:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0321 02:22:14.455182 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:22:14.456910 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:22:14.456920 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:22:14.456926 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:22:14.456999 543705 disk_worker.go:494] system disk:vda1
I0321 02:22:14.457041 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:22:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:22:15.456828 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:22:16.458100 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:22:16.458130 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:22:16.458161 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:22:16.458181 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:22:16.472553 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:22:23.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:23.409788 543705 memory.go:184] no items to output this cycle
I0321 02:22:23.409817 543705 cpu.go:275] no items to output this cycle
I0321 02:22:27.297684 543705 disk_info.go:125] begin check local disk info of client
I0321 02:22:27.300209 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:22:27.300217 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb840 0xc0001fb880]
E0321 02:22:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:33.409799 543705 memory.go:184] no items to output this cycle
I0321 02:22:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:22:43.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:43.409825 543705 memory.go:191] Add success.
I0321 02:22:43.409829 543705 cpu.go:282] Add success.
I0321 02:22:43.419982 543705 net.go:648] Add success.
I0321 02:22:43.422537 543705 net.go:770] primary dev: ETH0
I0321 02:22:43.422551 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:22:43.422562 543705 net.go:698] Add success.
I0321 02:22:46.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:22:46.458070 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:22:46.458095 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:22:53.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:22:53.409787 543705 memory.go:184] no items to output this cycle
I0321 02:22:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 02:23:03.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:03.409811 543705 memory.go:184] no items to output this cycle
I0321 02:23:03.409824 543705 cpu.go:275] no items to output this cycle
E0321 02:23:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:13.409788 543705 memory.go:191] Add success.
I0321 02:23:13.409807 543705 cpu.go:282] Add success.
W0321 02:23:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:23:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:23:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:23:13.420102 543705 net.go:648] Add success.
I0321 02:23:13.422880 543705 net.go:770] primary dev: ETH0
I0321 02:23:13.422892 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:23:13.422904 543705 net.go:698] Add success.
I0321 02:23:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:23:14.455156 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:23:14.455167 543705 disk_worker.go:708] disk space is not compliant
W0321 02:23:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:23:14.456515 543705 disk_worker.go:494] system disk:vda1
I0321 02:23:14.456559 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:23:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:23:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:23:16.458066 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:23:16.458094 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:23:16.472522 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:23:23.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:23.409791 543705 memory.go:184] no items to output this cycle
I0321 02:23:23.409792 543705 cpu.go:275] no items to output this cycle
I0321 02:23:27.301676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:23:27.304166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:23:27.304172 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa380 0xc0001fa3c0]
E0321 02:23:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:33.409790 543705 memory.go:184] no items to output this cycle
I0321 02:23:33.409796 543705 cpu.go:275] no items to output this cycle
E0321 02:23:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:43.409795 543705 cpu.go:282] Add success.
I0321 02:23:43.409796 543705 memory.go:191] Add success.
I0321 02:23:43.419963 543705 net.go:648] Add success.
I0321 02:23:43.422793 543705 net.go:770] primary dev: ETH0
I0321 02:23:43.422812 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:23:43.422832 543705 net.go:698] Add success.
I0321 02:23:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:23:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:23:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:23:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:23:53.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:23:53.409815 543705 memory.go:184] no items to output this cycle
E0321 02:24:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:03.409787 543705 memory.go:184] no items to output this cycle
I0321 02:24:03.409807 543705 cpu.go:275] no items to output this cycle
E0321 02:24:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:13.409805 543705 memory.go:191] Add success.
I0321 02:24:13.409808 543705 cpu.go:282] Add success.
W0321 02:24:13.409835 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:24:13.409846 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:24:13.409849 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:24:13.420048 543705 net.go:648] Add success.
I0321 02:24:13.422494 543705 net.go:770] primary dev: ETH0
I0321 02:24:13.422507 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:24:13.422519 543705 net.go:698] Add success.
I0321 02:24:13.468919 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"450b326d-07bc-49c6-8505-77846e886d91","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:24:13.468951 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:24:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:24:14.455112 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:24:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0321 02:24:14.455190 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:24:14.456692 543705 disk_worker.go:494] system disk:vda1
I0321 02:24:14.456729 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:24:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:24:16.457989 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:24:16.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:24:16.458089 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:24:16.472508 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:24:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:23.409803 543705 memory.go:184] no items to output this cycle
I0321 02:24:23.409807 543705 cpu.go:275] no items to output this cycle
I0321 02:24:27.305676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:24:27.308260 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:24:27.308266 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fa140 0xc0001fa180]
E0321 02:24:33.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:33.409785 543705 memory.go:184] no items to output this cycle
I0321 02:24:33.409793 543705 cpu.go:275] no items to output this cycle
I0321 02:24:38.695607 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:24:38.695613 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:24:43.409791 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:43.410727 543705 memory.go:191] Add success.
I0321 02:24:43.409835 543705 cpu.go:282] Add success.
I0321 02:24:43.420416 543705 net.go:648] Add success.
I0321 02:24:43.423280 543705 net.go:770] primary dev: ETH0
I0321 02:24:43.423293 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:24:43.423305 543705 net.go:698] Add success.
I0321 02:24:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:24:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:24:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:24:53.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:24:53.409807 543705 memory.go:184] no items to output this cycle
I0321 02:24:53.409819 543705 cpu.go:275] no items to output this cycle
E0321 02:25:03.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:03.409883 543705 cpu.go:275] no items to output this cycle
I0321 02:25:03.409887 543705 memory.go:184] no items to output this cycle
E0321 02:25:13.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:13.409769 543705 memory.go:191] Add success.
W0321 02:25:13.409805 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:25:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:25:13.409820 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:25:13.409833 543705 cpu.go:282] Add success.
I0321 02:25:13.420041 543705 net.go:648] Add success.
I0321 02:25:13.423013 543705 net.go:770] primary dev: ETH0
I0321 02:25:13.423026 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:25:13.423037 543705 net.go:698] Add success.
I0321 02:25:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:25:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:25:14.455184 543705 disk_worker.go:708] disk space is not compliant
W0321 02:25:14.455187 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:25:14.456770 543705 disk_worker.go:494] system disk:vda1
I0321 02:25:14.456811 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:25:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:25:16.457994 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:25:16.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:25:16.458091 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:25:16.472472 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:25:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:23.409780 543705 memory.go:184] no items to output this cycle
I0321 02:25:23.409784 543705 cpu.go:275] no items to output this cycle
I0321 02:25:27.309676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:25:27.312180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:25:27.312186 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fad00 0xc0001fad40]
E0321 02:25:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:33.409777 543705 memory.go:184] no items to output this cycle
I0321 02:25:33.409783 543705 cpu.go:275] no items to output this cycle
E0321 02:25:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:43.409790 543705 memory.go:191] Add success.
I0321 02:25:43.409791 543705 cpu.go:282] Add success.
I0321 02:25:43.419950 543705 net.go:648] Add success.
I0321 02:25:43.422736 543705 net.go:770] primary dev: ETH0
I0321 02:25:43.422749 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:25:43.422760 543705 net.go:698] Add success.
I0321 02:25:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:25:46.458030 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:25:46.458053 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:25:53.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:25:53.409774 543705 memory.go:184] no items to output this cycle
I0321 02:25:53.409777 543705 cpu.go:275] no items to output this cycle
E0321 02:26:03.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:03.409772 543705 memory.go:184] no items to output this cycle
I0321 02:26:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 02:26:13.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:13.409812 543705 memory.go:191] Add success.
I0321 02:26:13.409820 543705 cpu.go:282] Add success.
W0321 02:26:13.409851 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:26:13.409867 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:26:13.409872 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:26:13.420156 543705 net.go:648] Add success.
I0321 02:26:13.422880 543705 net.go:770] primary dev: ETH0
I0321 02:26:13.422899 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:26:13.422915 543705 net.go:698] Add success.
I0321 02:26:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:26:14.455132 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:26:14.455214 543705 disk_worker.go:708] disk space is not compliant
W0321 02:26:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:26:14.456592 543705 disk_worker.go:494] system disk:vda1
I0321 02:26:14.456623 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:26:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:26:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:26:16.458074 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:26:16.458099 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:26:16.472484 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:26:23.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:23.409779 543705 memory.go:184] no items to output this cycle
I0321 02:26:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 02:26:27.313678 543705 disk_info.go:125] begin check local disk info of client
I0321 02:26:27.316227 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:26:27.316233 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fbb80 0xc0001fbbc0]
E0321 02:26:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:33.409772 543705 memory.go:184] no items to output this cycle
I0321 02:26:33.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:26:43.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:43.409816 543705 memory.go:191] Add success.
I0321 02:26:43.409820 543705 cpu.go:282] Add success.
I0321 02:26:43.419969 543705 net.go:648] Add success.
I0321 02:26:43.422447 543705 net.go:770] primary dev: ETH0
I0321 02:26:43.422460 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:26:43.422474 543705 net.go:698] Add success.
I0321 02:26:46.457978 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:26:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:26:46.458077 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:26:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:26:53.409772 543705 memory.go:184] no items to output this cycle
I0321 02:26:53.409776 543705 cpu.go:275] no items to output this cycle
E0321 02:27:03.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:03.409770 543705 memory.go:184] no items to output this cycle
I0321 02:27:03.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:27:13.409853 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:13.409877 543705 memory.go:191] Add success.
W0321 02:27:13.409916 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:27:13.409929 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:27:13.409937 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:27:13.409942 543705 cpu.go:282] Add success.
I0321 02:27:13.425366 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 02:27:13.425609 543705 net.go:648] Add success.
I0321 02:27:13.428295 543705 net.go:770] primary dev: ETH0
I0321 02:27:13.428307 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:27:13.428318 543705 net.go:698] Add success.
I0321 02:27:13.452785 543705 event_worker.go:152] Polling the log file for events...
I0321 02:27:13.463425 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"44aa6401-8cf7-4f5e-a92b-c307a49fe7e4","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:27:13.463456 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 02:27:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:27:14.455181 543705 disk_worker.go:708] disk space is not compliant
W0321 02:27:14.455184 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:27:14.456757 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:27:14.456766 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:27:14.456771 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:27:14.456814 543705 disk_worker.go:494] system disk:vda1
I0321 02:27:14.456840 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:27:15.456796 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:27:15.456804 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:27:16.458099 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:27:16.458129 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:27:16.458164 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:27:16.458185 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:27:16.472557 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:27:23.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:23.409797 543705 memory.go:184] no items to output this cycle
I0321 02:27:23.409806 543705 cpu.go:275] no items to output this cycle
I0321 02:27:27.317679 543705 disk_info.go:125] begin check local disk info of client
I0321 02:27:27.320190 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:27:27.320196 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001fb040 0xc0001fb080]
E0321 02:27:33.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:33.409793 543705 memory.go:184] no items to output this cycle
I0321 02:27:33.409811 543705 cpu.go:275] no items to output this cycle
I0321 02:27:38.695752 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:27:38.695759 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:27:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:43.410567 543705 memory.go:191] Add success.
I0321 02:27:43.409789 543705 cpu.go:282] Add success.
I0321 02:27:43.420359 543705 net.go:648] Add success.
I0321 02:27:43.422989 543705 net.go:770] primary dev: ETH0
I0321 02:27:43.423002 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:27:43.423014 543705 net.go:698] Add success.
I0321 02:27:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:27:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:27:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:27:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:27:53.409771 543705 memory.go:184] no items to output this cycle
I0321 02:27:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 02:28:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:03.409777 543705 memory.go:184] no items to output this cycle
I0321 02:28:03.409880 543705 cpu.go:275] no items to output this cycle
E0321 02:28:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:13.409798 543705 memory.go:191] Add success.
I0321 02:28:13.409806 543705 cpu.go:282] Add success.
W0321 02:28:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:28:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:28:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:28:13.420154 543705 net.go:648] Add success.
I0321 02:28:13.422930 543705 net.go:770] primary dev: ETH0
I0321 02:28:13.422955 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:28:13.422969 543705 net.go:698] Add success.
I0321 02:28:14.454976 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:28:14.455196 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:28:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0321 02:28:14.455209 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:28:14.456587 543705 disk_worker.go:494] system disk:vda1
I0321 02:28:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:28:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:28:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:28:16.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:28:16.458085 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:28:16.472512 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:28:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:23.409784 543705 memory.go:184] no items to output this cycle
I0321 02:28:23.409804 543705 cpu.go:275] no items to output this cycle
I0321 02:28:27.321680 543705 disk_info.go:125] begin check local disk info of client
I0321 02:28:27.324233 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:28:27.324246 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b1c0 0xc00007b200]
E0321 02:28:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:33.409793 543705 memory.go:184] no items to output this cycle
I0321 02:28:33.409808 543705 cpu.go:275] no items to output this cycle
E0321 02:28:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:43.409786 543705 memory.go:191] Add success.
I0321 02:28:43.409800 543705 cpu.go:282] Add success.
I0321 02:28:43.419878 543705 net.go:648] Add success.
I0321 02:28:43.422523 543705 net.go:770] primary dev: ETH0
I0321 02:28:43.422536 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:28:43.422548 543705 net.go:698] Add success.
I0321 02:28:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:28:46.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:28:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:28:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:28:53.409794 543705 memory.go:184] no items to output this cycle
I0321 02:28:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 02:29:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:03.409780 543705 memory.go:184] no items to output this cycle
I0321 02:29:03.409781 543705 cpu.go:275] no items to output this cycle
E0321 02:29:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:13.409818 543705 memory.go:191] Add success.
I0321 02:29:13.409827 543705 cpu.go:282] Add success.
W0321 02:29:13.409850 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:29:13.409865 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:29:13.409869 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:29:13.420189 543705 net.go:648] Add success.
I0321 02:29:13.422882 543705 net.go:770] primary dev: ETH0
I0321 02:29:13.422897 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:29:13.422910 543705 net.go:698] Add success.
I0321 02:29:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:29:14.455177 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:29:14.455187 543705 disk_worker.go:708] disk space is not compliant
W0321 02:29:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:29:14.456586 543705 disk_worker.go:494] system disk:vda1
I0321 02:29:14.456616 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:29:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:29:16.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:29:16.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:29:16.458082 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:29:16.472576 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:29:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:23.409782 543705 memory.go:184] no items to output this cycle
I0321 02:29:23.409784 543705 cpu.go:275] no items to output this cycle
I0321 02:29:27.325679 543705 disk_info.go:125] begin check local disk info of client
I0321 02:29:27.328180 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:29:27.328186 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053bc40 0xc00053bc80]
E0321 02:29:33.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:33.409790 543705 memory.go:184] no items to output this cycle
I0321 02:29:33.409805 543705 cpu.go:275] no items to output this cycle
E0321 02:29:43.409753 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:43.409779 543705 memory.go:191] Add success.
I0321 02:29:43.409793 543705 cpu.go:282] Add success.
I0321 02:29:43.419971 543705 net.go:648] Add success.
I0321 02:29:43.420868 543705 net.go:770] primary dev: ETH0
I0321 02:29:43.420882 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:29:43.420896 543705 net.go:698] Add success.
I0321 02:29:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:29:46.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:29:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:29:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:29:53.409772 543705 memory.go:184] no items to output this cycle
I0321 02:29:53.409794 543705 cpu.go:275] no items to output this cycle
E0321 02:30:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:03.409768 543705 memory.go:184] no items to output this cycle
I0321 02:30:03.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:30:13.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:13.409897 543705 cpu.go:282] Add success.
I0321 02:30:13.409939 543705 memory.go:191] Add success.
W0321 02:30:13.409972 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:30:13.409985 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:30:13.409988 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:30:13.419705 543705 net.go:648] Add success.
I0321 02:30:13.422493 543705 net.go:770] primary dev: ETH0
I0321 02:30:13.422506 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:30:13.422517 543705 net.go:698] Add success.
I0321 02:30:13.468805 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8c2d434d-ca99-4cd0-8bab-9717e3660acf","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:30:13.468839 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:30:14.454960 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:30:14.455119 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:30:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0321 02:30:14.455201 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:30:14.456692 543705 disk_worker.go:494] system disk:vda1
I0321 02:30:14.456728 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:30:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:30:16.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:30:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:30:16.458090 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:30:16.472547 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:30:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:23.409814 543705 memory.go:184] no items to output this cycle
I0321 02:30:23.409825 543705 cpu.go:275] no items to output this cycle
I0321 02:30:27.329674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:30:27.332300 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:30:27.332306 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053ac00 0xc00053ac40]
E0321 02:30:33.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:33.409776 543705 memory.go:184] no items to output this cycle
I0321 02:30:33.409796 543705 cpu.go:275] no items to output this cycle
I0321 02:30:38.697189 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:30:38.697196 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:30:43.409743 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:43.410735 543705 memory.go:191] Add success.
I0321 02:30:43.409811 543705 cpu.go:282] Add success.
I0321 02:30:43.420430 543705 net.go:648] Add success.
I0321 02:30:43.423002 543705 net.go:770] primary dev: ETH0
I0321 02:30:43.423015 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:30:43.423028 543705 net.go:698] Add success.
I0321 02:30:46.457969 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:30:46.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:30:46.458061 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:30:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:30:53.409789 543705 memory.go:184] no items to output this cycle
I0321 02:30:53.409803 543705 cpu.go:275] no items to output this cycle
E0321 02:31:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:03.409778 543705 cpu.go:275] no items to output this cycle
I0321 02:31:03.409782 543705 memory.go:184] no items to output this cycle
E0321 02:31:13.409896 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:13.409913 543705 cpu.go:282] Add success.
I0321 02:31:13.409924 543705 memory.go:191] Add success.
W0321 02:31:13.409957 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:31:13.409970 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:31:13.409976 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:31:13.419737 543705 net.go:648] Add success.
I0321 02:31:13.422930 543705 net.go:770] primary dev: ETH0
I0321 02:31:13.422944 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:31:13.422957 543705 net.go:698] Add success.
I0321 02:31:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:31:14.455160 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:31:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0321 02:31:14.455174 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:31:14.456564 543705 disk_worker.go:494] system disk:vda1
I0321 02:31:14.456593 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:31:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:31:16.457990 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:31:16.458057 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:31:16.458084 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:31:16.472542 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:31:23.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:23.409789 543705 memory.go:184] no items to output this cycle
I0321 02:31:23.409803 543705 cpu.go:275] no items to output this cycle
I0321 02:31:27.333676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:31:27.336161 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:31:27.336167 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd200 0xc0002bd240]
E0321 02:31:33.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:33.409781 543705 memory.go:184] no items to output this cycle
I0321 02:31:33.409785 543705 cpu.go:275] no items to output this cycle
E0321 02:31:43.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:43.409796 543705 memory.go:191] Add success.
I0321 02:31:43.409797 543705 cpu.go:282] Add success.
I0321 02:31:43.419835 543705 net.go:648] Add success.
I0321 02:31:43.422318 543705 net.go:770] primary dev: ETH0
I0321 02:31:43.422330 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:31:43.422343 543705 net.go:698] Add success.
I0321 02:31:46.457996 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:31:46.458062 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:31:46.458092 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:31:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:31:53.409786 543705 memory.go:184] no items to output this cycle
I0321 02:31:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 02:32:03.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:03.409801 543705 memory.go:184] no items to output this cycle
I0321 02:32:03.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:32:13.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:13.409898 543705 memory.go:191] Add success.
W0321 02:32:13.409929 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:32:13.409946 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:32:13.409950 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:32:13.409977 543705 cpu.go:282] Add success.
I0321 02:32:13.419706 543705 net.go:648] Add success.
I0321 02:32:13.422399 543705 net.go:770] primary dev: ETH0
I0321 02:32:13.422417 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:32:13.422435 543705 net.go:698] Add success.
W0321 02:32:14.455162 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:32:14.455172 543705 disk_worker.go:708] disk space is not compliant
W0321 02:32:14.455175 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:32:14.455913 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:32:14.455922 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:32:14.455928 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:32:14.456552 543705 disk_worker.go:494] system disk:vda1
I0321 02:32:14.456581 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:32:15.456854 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:32:15.456862 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:32:16.458027 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:32:16.458027 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:32:16.458099 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:32:16.458125 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:32:16.472535 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:32:23.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:23.409815 543705 memory.go:184] no items to output this cycle
I0321 02:32:23.409824 543705 cpu.go:275] no items to output this cycle
I0321 02:32:27.337676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:32:27.340223 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:32:27.340232 543705 disk_info.go:196] parse disk info done, disk is : [0xc00039bc00 0xc00039bc40]
E0321 02:32:33.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:33.409770 543705 memory.go:184] no items to output this cycle
I0321 02:32:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:32:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:43.409809 543705 memory.go:191] Add success.
I0321 02:32:43.409818 543705 cpu.go:282] Add success.
I0321 02:32:43.419954 543705 net.go:648] Add success.
I0321 02:32:43.422747 543705 net.go:770] primary dev: ETH0
I0321 02:32:43.422762 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:32:43.422775 543705 net.go:698] Add success.
I0321 02:32:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:32:46.458048 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:32:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:32:53.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:32:53.409781 543705 cpu.go:275] no items to output this cycle
I0321 02:32:53.409785 543705 memory.go:184] no items to output this cycle
E0321 02:33:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:03.409911 543705 memory.go:184] no items to output this cycle
I0321 02:33:03.409913 543705 cpu.go:275] no items to output this cycle
E0321 02:33:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:13.409785 543705 memory.go:191] Add success.
I0321 02:33:13.409807 543705 cpu.go:282] Add success.
W0321 02:33:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:33:13.409820 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:33:13.409823 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:33:13.420115 543705 net.go:648] Add success.
I0321 02:33:13.423129 543705 net.go:770] primary dev: ETH0
I0321 02:33:13.423142 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:33:13.423153 543705 net.go:698] Add success.
I0321 02:33:13.543974 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"11ab8216-d221-4c75-92ba-9252d23041cd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:33:13.544006 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:33:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:33:14.455151 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:33:14.455162 543705 disk_worker.go:708] disk space is not compliant
W0321 02:33:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:33:14.456537 543705 disk_worker.go:494] system disk:vda1
I0321 02:33:14.456591 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:33:15.455968 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:33:16.457988 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:33:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:33:16.458081 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:33:16.472482 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:33:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:23.409779 543705 memory.go:184] no items to output this cycle
I0321 02:33:23.409795 543705 cpu.go:275] no items to output this cycle
I0321 02:33:27.341676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:33:27.344198 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:33:27.344205 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aaf00 0xc0001aaf40]
E0321 02:33:33.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:33.409780 543705 memory.go:184] no items to output this cycle
I0321 02:33:33.409800 543705 cpu.go:275] no items to output this cycle
I0321 02:33:38.697732 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:33:38.697738 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:33:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:43.410773 543705 memory.go:191] Add success.
I0321 02:33:43.409792 543705 cpu.go:282] Add success.
I0321 02:33:43.420500 543705 net.go:648] Add success.
I0321 02:33:43.423679 543705 net.go:770] primary dev: ETH0
I0321 02:33:43.423694 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:33:43.423709 543705 net.go:698] Add success.
I0321 02:33:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:33:46.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:33:46.458052 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:33:53.409894 543705 cpu.go:275] no items to output this cycle
E0321 02:33:53.409975 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:33:53.409988 543705 memory.go:184] no items to output this cycle
E0321 02:34:03.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:03.409783 543705 memory.go:184] no items to output this cycle
I0321 02:34:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 02:34:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:13.409804 543705 memory.go:191] Add success.
I0321 02:34:13.409811 543705 cpu.go:282] Add success.
W0321 02:34:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:34:13.409848 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:34:13.409852 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:34:13.420168 543705 net.go:648] Add success.
I0321 02:34:13.423313 543705 net.go:770] primary dev: ETH0
I0321 02:34:13.423327 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:34:13.423340 543705 net.go:698] Add success.
I0321 02:34:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:34:14.455159 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:34:14.455169 543705 disk_worker.go:708] disk space is not compliant
W0321 02:34:14.455172 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:34:14.456530 543705 disk_worker.go:494] system disk:vda1
I0321 02:34:14.456568 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:34:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:34:16.457973 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:34:16.458033 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:34:16.458055 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:34:16.472447 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:34:23.409797 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:23.409819 543705 memory.go:184] no items to output this cycle
I0321 02:34:23.409835 543705 cpu.go:275] no items to output this cycle
I0321 02:34:27.345677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:34:27.348245 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:34:27.348251 543705 disk_info.go:196] parse disk info done, disk is : [0xc000513540 0xc000513580]
E0321 02:34:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:33.409802 543705 memory.go:184] no items to output this cycle
I0321 02:34:33.409820 543705 cpu.go:275] no items to output this cycle
E0321 02:34:43.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:43.409819 543705 memory.go:191] Add success.
I0321 02:34:43.409825 543705 cpu.go:282] Add success.
I0321 02:34:43.419904 543705 net.go:648] Add success.
I0321 02:34:43.423096 543705 net.go:770] primary dev: ETH0
I0321 02:34:43.423110 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:34:43.423122 543705 net.go:698] Add success.
I0321 02:34:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:34:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:34:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:34:53.409868 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:34:53.409890 543705 memory.go:184] no items to output this cycle
I0321 02:34:53.409967 543705 cpu.go:275] no items to output this cycle
E0321 02:35:03.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:03.409777 543705 memory.go:184] no items to output this cycle
I0321 02:35:03.409787 543705 cpu.go:275] no items to output this cycle
E0321 02:35:13.409748 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:13.409776 543705 memory.go:191] Add success.
W0321 02:35:13.409801 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:35:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:35:13.409814 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:35:13.409814 543705 cpu.go:282] Add success.
I0321 02:35:13.420221 543705 net.go:648] Add success.
I0321 02:35:13.422856 543705 net.go:770] primary dev: ETH0
I0321 02:35:13.422870 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:35:13.422884 543705 net.go:698] Add success.
I0321 02:35:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:35:14.455098 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:35:14.455161 543705 disk_worker.go:708] disk space is not compliant
W0321 02:35:14.455164 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:35:14.456481 543705 disk_worker.go:494] system disk:vda1
I0321 02:35:14.456527 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:35:15.455966 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:35:16.457982 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:35:16.458036 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:35:16.458058 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:35:16.472528 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:35:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:23.409782 543705 memory.go:184] no items to output this cycle
I0321 02:35:23.409813 543705 cpu.go:275] no items to output this cycle
I0321 02:35:27.349676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:35:27.352172 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:35:27.352178 543705 disk_info.go:196] parse disk info done, disk is : [0xc000513540 0xc000513580]
E0321 02:35:33.409793 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:33.409815 543705 memory.go:184] no items to output this cycle
I0321 02:35:33.409830 543705 cpu.go:275] no items to output this cycle
E0321 02:35:43.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:43.409782 543705 memory.go:191] Add success.
I0321 02:35:43.409806 543705 cpu.go:282] Add success.
I0321 02:35:43.419861 543705 net.go:648] Add success.
I0321 02:35:43.422336 543705 net.go:770] primary dev: ETH0
I0321 02:35:43.422351 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:35:43.422365 543705 net.go:698] Add success.
I0321 02:35:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:35:46.458042 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:35:46.458069 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:35:53.410687 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:35:53.410706 543705 memory.go:184] no items to output this cycle
I0321 02:35:53.410717 543705 cpu.go:275] no items to output this cycle
E0321 02:36:03.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:03.409787 543705 memory.go:184] no items to output this cycle
I0321 02:36:03.409810 543705 cpu.go:275] no items to output this cycle
E0321 02:36:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:13.409795 543705 memory.go:191] Add success.
I0321 02:36:13.409802 543705 cpu.go:282] Add success.
W0321 02:36:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:36:13.409836 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:36:13.409839 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:36:13.420073 543705 net.go:648] Add success.
I0321 02:36:13.423090 543705 net.go:770] primary dev: ETH0
I0321 02:36:13.423103 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:36:13.423115 543705 net.go:698] Add success.
I0321 02:36:13.470108 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0afc73eb-76c5-4dec-9bec-109d3d5cc455","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:36:13.470140 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:36:14.454964 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:36:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:36:14.455166 543705 disk_worker.go:708] disk space is not compliant
W0321 02:36:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:36:14.456521 543705 disk_worker.go:494] system disk:vda1
I0321 02:36:14.456565 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:36:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:36:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:36:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:36:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:36:16.472387 543705 disk_local_worker.go:436] Get disk info: []
I0321 02:36:23.409814 543705 cpu.go:275] no items to output this cycle
E0321 02:36:23.409820 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:23.409839 543705 memory.go:184] no items to output this cycle
I0321 02:36:27.353680 543705 disk_info.go:125] begin check local disk info of client
I0321 02:36:27.356209 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:36:27.356217 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007ba80 0xc00007bac0]
E0321 02:36:33.409796 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:33.409817 543705 memory.go:184] no items to output this cycle
I0321 02:36:33.409832 543705 cpu.go:275] no items to output this cycle
I0321 02:36:38.699210 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:36:38.699217 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:36:43.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:43.410620 543705 memory.go:191] Add success.
I0321 02:36:43.409814 543705 cpu.go:282] Add success.
I0321 02:36:43.420387 543705 net.go:648] Add success.
I0321 02:36:43.423071 543705 net.go:770] primary dev: ETH0
I0321 02:36:43.423084 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:36:43.423097 543705 net.go:698] Add success.
I0321 02:36:46.457987 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:36:46.458064 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:36:46.458090 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:36:53.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:36:53.409771 543705 memory.go:184] no items to output this cycle
I0321 02:36:53.409800 543705 cpu.go:275] no items to output this cycle
E0321 02:37:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:03.409780 543705 memory.go:184] no items to output this cycle
I0321 02:37:03.409802 543705 cpu.go:275] no items to output this cycle
E0321 02:37:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:13.409792 543705 memory.go:191] Add success.
I0321 02:37:13.409799 543705 cpu.go:282] Add success.
W0321 02:37:13.409822 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:37:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:37:13.409837 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:37:13.420451 543705 net.go:648] Add success.
I0321 02:37:13.423003 543705 net.go:770] primary dev: ETH0
I0321 02:37:13.423016 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:37:13.423028 543705 net.go:698] Add success.
I0321 02:37:13.453596 543705 event_worker.go:152] Polling the log file for events...
W0321 02:37:14.455166 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:37:14.455177 543705 disk_worker.go:708] disk space is not compliant
W0321 02:37:14.455181 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:37:14.455892 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:37:14.455901 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:37:14.455907 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:37:14.456538 543705 disk_worker.go:494] system disk:vda1
I0321 02:37:14.456569 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:37:15.456837 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:37:15.456846 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:37:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:37:16.457982 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:37:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:37:16.458063 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:37:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:37:23.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:23.409792 543705 memory.go:184] no items to output this cycle
I0321 02:37:23.409797 543705 cpu.go:275] no items to output this cycle
I0321 02:37:27.357677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:37:27.360156 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:37:27.360163 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004cc300 0xc0004cc340]
E0321 02:37:33.409773 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:33.409789 543705 memory.go:184] no items to output this cycle
I0321 02:37:33.409794 543705 cpu.go:275] no items to output this cycle
E0321 02:37:43.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:43.409796 543705 memory.go:191] Add success.
I0321 02:37:43.409814 543705 cpu.go:282] Add success.
I0321 02:37:43.419987 543705 net.go:648] Add success.
I0321 02:37:43.422571 543705 net.go:770] primary dev: ETH0
I0321 02:37:43.422586 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:37:43.422600 543705 net.go:698] Add success.
I0321 02:37:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:37:46.458035 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:37:46.458062 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:37:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:37:53.409788 543705 cpu.go:275] no items to output this cycle
I0321 02:37:53.409791 543705 memory.go:184] no items to output this cycle
E0321 02:38:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:03.409775 543705 memory.go:184] no items to output this cycle
I0321 02:38:03.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:38:13.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:13.409792 543705 memory.go:191] Add success.
I0321 02:38:13.409805 543705 cpu.go:282] Add success.
W0321 02:38:13.409818 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:38:13.412348 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:38:13.412352 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:38:13.420024 543705 net.go:648] Add success.
I0321 02:38:13.421729 543705 net.go:770] primary dev: ETH0
I0321 02:38:13.421753 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:38:13.421766 543705 net.go:698] Add success.
I0321 02:38:14.454973 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:38:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:38:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0321 02:38:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:38:14.456491 543705 disk_worker.go:494] system disk:vda1
I0321 02:38:14.456537 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:38:15.455957 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:38:16.457974 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:38:16.458038 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:38:16.458059 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:38:16.472426 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:38:23.409764 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:23.409782 543705 memory.go:184] no items to output this cycle
I0321 02:38:23.409845 543705 cpu.go:275] no items to output this cycle
I0321 02:38:27.361675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:38:27.364159 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:38:27.364168 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053b940 0xc00053b980]
E0321 02:38:33.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:33.409794 543705 cpu.go:275] no items to output this cycle
I0321 02:38:33.409797 543705 memory.go:184] no items to output this cycle
E0321 02:38:43.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:43.409790 543705 memory.go:191] Add success.
I0321 02:38:43.409813 543705 cpu.go:282] Add success.
I0321 02:38:43.419858 543705 net.go:648] Add success.
I0321 02:38:43.422870 543705 net.go:770] primary dev: ETH0
I0321 02:38:43.422883 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:38:43.422895 543705 net.go:698] Add success.
I0321 02:38:46.457986 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:38:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:38:46.458086 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:38:53.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:38:53.409788 543705 memory.go:184] no items to output this cycle
I0321 02:38:53.409790 543705 cpu.go:275] no items to output this cycle
E0321 02:39:03.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:03.409788 543705 memory.go:184] no items to output this cycle
I0321 02:39:03.409806 543705 cpu.go:275] no items to output this cycle
E0321 02:39:13.409881 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:13.409915 543705 memory.go:191] Add success.
I0321 02:39:13.409916 543705 cpu.go:282] Add success.
W0321 02:39:13.409959 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:39:13.409976 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:39:13.409981 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:39:13.419713 543705 net.go:648] Add success.
I0321 02:39:13.422446 543705 net.go:770] primary dev: ETH0
I0321 02:39:13.422461 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:39:13.422475 543705 net.go:698] Add success.
I0321 02:39:13.463830 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"0a4c26d1-48d2-4796-9c2d-eb58fd185420","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:39:13.463862 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:39:14.454972 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:39:14.455170 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:39:14.455180 543705 disk_worker.go:708] disk space is not compliant
W0321 02:39:14.455183 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:39:14.456666 543705 disk_worker.go:494] system disk:vda1
I0321 02:39:14.456694 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:39:15.455956 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:39:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:39:16.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:39:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:39:16.472435 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:39:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:23.409814 543705 memory.go:184] no items to output this cycle
I0321 02:39:23.409824 543705 cpu.go:275] no items to output this cycle
I0321 02:39:27.365679 543705 disk_info.go:125] begin check local disk info of client
I0321 02:39:27.368165 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:39:27.368172 543705 disk_info.go:196] parse disk info done, disk is : [0xc0000c5f00 0xc0000c5f40]
E0321 02:39:33.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:33.409774 543705 memory.go:184] no items to output this cycle
I0321 02:39:33.409793 543705 cpu.go:275] no items to output this cycle
I0321 02:39:38.699359 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:39:38.699366 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:39:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:43.410607 543705 memory.go:191] Add success.
I0321 02:39:43.409803 543705 cpu.go:282] Add success.
I0321 02:39:43.420284 543705 net.go:648] Add success.
I0321 02:39:43.422902 543705 net.go:770] primary dev: ETH0
I0321 02:39:43.422915 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:39:43.422929 543705 net.go:698] Add success.
I0321 02:39:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:39:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:39:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:39:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:39:53.409771 543705 memory.go:184] no items to output this cycle
I0321 02:39:53.409779 543705 cpu.go:275] no items to output this cycle
E0321 02:40:03.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:03.409768 543705 memory.go:184] no items to output this cycle
I0321 02:40:03.409804 543705 cpu.go:275] no items to output this cycle
E0321 02:40:13.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:13.409818 543705 memory.go:191] Add success.
I0321 02:40:13.409826 543705 cpu.go:282] Add success.
W0321 02:40:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:40:13.409877 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:40:13.409881 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:40:13.419545 543705 net.go:770] primary dev: ETH0
I0321 02:40:13.419560 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:40:13.419573 543705 net.go:698] Add success.
I0321 02:40:13.419922 543705 net.go:648] Add success.
I0321 02:40:14.454959 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:40:14.455155 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:40:14.455165 543705 disk_worker.go:708] disk space is not compliant
W0321 02:40:14.455169 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:40:14.456482 543705 disk_worker.go:494] system disk:vda1
I0321 02:40:14.456526 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:40:15.455970 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:40:16.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:40:16.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:40:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:40:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:40:23.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:23.409784 543705 memory.go:184] no items to output this cycle
I0321 02:40:23.409804 543705 cpu.go:275] no items to output this cycle
I0321 02:40:27.369677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:40:27.372232 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:40:27.372239 543705 disk_info.go:196] parse disk info done, disk is : [0xc000375380 0xc0003753c0]
E0321 02:40:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:33.409778 543705 memory.go:184] no items to output this cycle
I0321 02:40:33.409780 543705 cpu.go:275] no items to output this cycle
E0321 02:40:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:43.409788 543705 memory.go:191] Add success.
I0321 02:40:43.409808 543705 cpu.go:282] Add success.
I0321 02:40:43.419929 543705 net.go:648] Add success.
I0321 02:40:43.422667 543705 net.go:770] primary dev: ETH0
I0321 02:40:43.422680 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:40:43.422692 543705 net.go:698] Add success.
I0321 02:40:46.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:40:46.458039 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:40:46.458063 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:40:53.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:40:53.409782 543705 memory.go:184] no items to output this cycle
I0321 02:40:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 02:41:03.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:03.409765 543705 memory.go:184] no items to output this cycle
I0321 02:41:03.409797 543705 cpu.go:275] no items to output this cycle
E0321 02:41:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:13.409775 543705 memory.go:191] Add success.
W0321 02:41:13.409997 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:41:13.410010 543705 cpu.go:282] Add success.
W0321 02:41:13.410019 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:41:13.410023 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:41:13.419654 543705 net.go:648] Add success.
I0321 02:41:13.422595 543705 net.go:770] primary dev: ETH0
I0321 02:41:13.422608 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:41:13.422619 543705 net.go:698] Add success.
I0321 02:41:14.454978 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:41:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:41:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0321 02:41:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:41:14.456629 543705 disk_worker.go:494] system disk:vda1
I0321 02:41:14.456666 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:41:15.455967 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:41:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:41:16.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:41:16.458076 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:41:16.472403 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:41:23.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:23.409795 543705 memory.go:184] no items to output this cycle
I0321 02:41:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 02:41:27.373675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:41:27.376183 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:41:27.376190 543705 disk_info.go:196] parse disk info done, disk is : [0xc00007b3c0 0xc00007b400]
E0321 02:41:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:33.409798 543705 memory.go:184] no items to output this cycle
I0321 02:41:33.409813 543705 cpu.go:275] no items to output this cycle
E0321 02:41:43.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:43.409808 543705 memory.go:191] Add success.
I0321 02:41:43.409813 543705 cpu.go:282] Add success.
I0321 02:41:43.419954 543705 net.go:648] Add success.
I0321 02:41:43.422492 543705 net.go:770] primary dev: ETH0
I0321 02:41:43.422507 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:41:43.422522 543705 net.go:698] Add success.
I0321 02:41:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:41:46.458052 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:41:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:41:53.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:41:53.409798 543705 memory.go:184] no items to output this cycle
I0321 02:41:53.409806 543705 cpu.go:275] no items to output this cycle
E0321 02:42:03.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:03.409776 543705 memory.go:184] no items to output this cycle
I0321 02:42:03.409778 543705 cpu.go:275] no items to output this cycle
E0321 02:42:13.409768 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:13.409802 543705 memory.go:191] Add success.
I0321 02:42:13.409807 543705 cpu.go:282] Add success.
W0321 02:42:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:42:13.409849 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:42:13.409853 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:42:13.420403 543705 net.go:648] Add success.
I0321 02:42:13.423161 543705 net.go:770] primary dev: ETH0
I0321 02:42:13.423174 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:42:13.423184 543705 net.go:698] Add success.
I0321 02:42:13.485848 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"68572062-e5d4-443d-83cd-8fac66229632","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:42:13.485879 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 02:42:14.455192 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:42:14.455206 543705 disk_worker.go:708] disk space is not compliant
W0321 02:42:14.455211 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:42:14.456816 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:42:14.456826 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:42:14.456832 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:42:14.456968 543705 disk_worker.go:494] system disk:vda1
I0321 02:42:14.457004 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:42:15.456817 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:42:15.456825 543705 custom_config.go:64] query custom config with name: huawei_npu
E0321 02:42:16.457885 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:42:16.457886 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:42:16.457952 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:42:16.457972 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:42:16.472283 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:42:23.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:23.409806 543705 memory.go:184] no items to output this cycle
I0321 02:42:23.409815 543705 cpu.go:275] no items to output this cycle
I0321 02:42:27.377675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:42:27.380229 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:42:27.380237 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004ae8c0 0xc0004ae900]
E0321 02:42:33.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:33.409766 543705 memory.go:184] no items to output this cycle
I0321 02:42:33.409803 543705 cpu.go:275] no items to output this cycle
I0321 02:42:38.700197 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:42:38.700204 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:42:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:43.410656 543705 memory.go:191] Add success.
I0321 02:42:43.409788 543705 cpu.go:282] Add success.
I0321 02:42:43.420480 543705 net.go:648] Add success.
I0321 02:42:43.423042 543705 net.go:770] primary dev: ETH0
I0321 02:42:43.423056 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:42:43.423068 543705 net.go:698] Add success.
I0321 02:42:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:42:46.458050 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:42:46.458076 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:42:53.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:42:53.409793 543705 memory.go:184] no items to output this cycle
I0321 02:42:53.409802 543705 cpu.go:275] no items to output this cycle
E0321 02:43:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:03.409776 543705 memory.go:184] no items to output this cycle
I0321 02:43:03.409786 543705 cpu.go:275] no items to output this cycle
E0321 02:43:13.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:13.409786 543705 memory.go:191] Add success.
I0321 02:43:13.409787 543705 cpu.go:282] Add success.
W0321 02:43:13.409816 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:43:13.409828 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:43:13.409831 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:43:13.420318 543705 net.go:648] Add success.
I0321 02:43:13.423053 543705 net.go:770] primary dev: ETH0
I0321 02:43:13.423066 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:43:13.423077 543705 net.go:698] Add success.
I0321 02:43:14.454944 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:43:14.455200 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:43:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0321 02:43:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:43:14.456582 543705 disk_worker.go:494] system disk:vda1
I0321 02:43:14.456611 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:43:15.455954 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:43:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:43:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:43:16.458069 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:43:16.472398 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:43:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:23.409777 543705 memory.go:184] no items to output this cycle
I0321 02:43:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 02:43:27.381676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:43:27.384174 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:43:27.384180 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003d5180 0xc0003d51c0]
E0321 02:43:33.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:33.409797 543705 memory.go:184] no items to output this cycle
I0321 02:43:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 02:43:43.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:43.409781 543705 memory.go:191] Add success.
I0321 02:43:43.409800 543705 cpu.go:282] Add success.
I0321 02:43:43.419868 543705 net.go:648] Add success.
I0321 02:43:43.422821 543705 net.go:770] primary dev: ETH0
I0321 02:43:43.422834 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:43:43.422846 543705 net.go:698] Add success.
I0321 02:43:46.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:43:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:43:46.458073 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:43:53.409756 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:43:53.409773 543705 memory.go:184] no items to output this cycle
I0321 02:43:53.409789 543705 cpu.go:275] no items to output this cycle
E0321 02:44:03.409754 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:03.409772 543705 memory.go:184] no items to output this cycle
I0321 02:44:03.409792 543705 cpu.go:275] no items to output this cycle
E0321 02:44:13.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:13.409784 543705 memory.go:191] Add success.
I0321 02:44:13.409788 543705 cpu.go:282] Add success.
W0321 02:44:13.409812 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:44:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:44:13.409825 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:44:13.420440 543705 net.go:648] Add success.
I0321 02:44:13.423556 543705 net.go:770] primary dev: ETH0
I0321 02:44:13.423569 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:44:13.423580 543705 net.go:698] Add success.
I0321 02:44:14.454948 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:44:14.455110 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:44:14.455174 543705 disk_worker.go:708] disk space is not compliant
W0321 02:44:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:44:14.456495 543705 disk_worker.go:494] system disk:vda1
I0321 02:44:14.456540 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:44:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:44:16.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:44:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:44:16.458074 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:44:16.472400 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:44:23.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:23.409778 543705 memory.go:184] no items to output this cycle
I0321 02:44:23.409800 543705 cpu.go:275] no items to output this cycle
I0321 02:44:27.385677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:44:27.388254 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:44:27.388260 543705 disk_info.go:196] parse disk info done, disk is : [0xc0001aab00 0xc0001aab40]
E0321 02:44:33.409784 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:33.409804 543705 memory.go:184] no items to output this cycle
I0321 02:44:33.409816 543705 cpu.go:275] no items to output this cycle
E0321 02:44:43.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:43.409788 543705 memory.go:191] Add success.
I0321 02:44:43.409810 543705 cpu.go:282] Add success.
I0321 02:44:43.419955 543705 net.go:648] Add success.
I0321 02:44:43.422789 543705 net.go:770] primary dev: ETH0
I0321 02:44:43.422804 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:44:43.422817 543705 net.go:698] Add success.
I0321 02:44:46.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:44:46.458034 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:44:46.458056 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:44:53.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:44:53.409776 543705 memory.go:184] no items to output this cycle
I0321 02:44:53.409782 543705 cpu.go:275] no items to output this cycle
E0321 02:45:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:03.409799 543705 memory.go:184] no items to output this cycle
I0321 02:45:03.409816 543705 cpu.go:275] no items to output this cycle
E0321 02:45:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:13.409774 543705 memory.go:191] Add success.
W0321 02:45:13.409799 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:45:13.409811 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:45:13.409810 543705 cpu.go:282] Add success.
I0321 02:45:13.409814 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:45:13.420287 543705 net.go:648] Add success.
I0321 02:45:13.423203 543705 net.go:770] primary dev: ETH0
I0321 02:45:13.423218 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:45:13.423232 543705 net.go:698] Add success.
I0321 02:45:13.468262 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"d38c62c7-24b8-4a44-89d9-106596712bfd","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:45:13.468294 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:45:14.454957 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:45:14.455148 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:45:14.455159 543705 disk_worker.go:708] disk space is not compliant
W0321 02:45:14.455162 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:45:14.456469 543705 disk_worker.go:494] system disk:vda1
I0321 02:45:14.456513 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:45:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:45:16.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:45:16.458043 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:45:16.458061 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:45:16.472402 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:45:23.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:23.409776 543705 memory.go:184] no items to output this cycle
I0321 02:45:23.409807 543705 cpu.go:275] no items to output this cycle
I0321 02:45:27.392066 543705 disk_info.go:125] begin check local disk info of client
I0321 02:45:27.394711 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:45:27.394718 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004863c0 0xc000486400]
E0321 02:45:33.409762 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:33.409780 543705 memory.go:184] no items to output this cycle
I0321 02:45:33.409792 543705 cpu.go:275] no items to output this cycle
I0321 02:45:38.700343 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:45:38.700349 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:45:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:43.410723 543705 memory.go:191] Add success.
I0321 02:45:43.409824 543705 cpu.go:282] Add success.
I0321 02:45:43.420426 543705 net.go:648] Add success.
I0321 02:45:43.423403 543705 net.go:770] primary dev: ETH0
I0321 02:45:43.423416 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:45:43.423428 543705 net.go:698] Add success.
I0321 02:45:46.457981 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:45:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:45:46.458071 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:45:53.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:45:53.409778 543705 memory.go:184] no items to output this cycle
I0321 02:45:53.409809 543705 cpu.go:275] no items to output this cycle
E0321 02:46:03.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:03.409799 543705 memory.go:184] no items to output this cycle
I0321 02:46:03.409814 543705 cpu.go:275] no items to output this cycle
E0321 02:46:13.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:13.409878 543705 memory.go:191] Add success.
W0321 02:46:13.409909 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:46:13.409922 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:46:13.409925 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:46:13.409937 543705 cpu.go:282] Add success.
I0321 02:46:13.419714 543705 net.go:648] Add success.
I0321 02:46:13.422269 543705 net.go:770] primary dev: ETH0
I0321 02:46:13.422282 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:46:13.422293 543705 net.go:698] Add success.
I0321 02:46:14.454967 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:46:14.455161 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:46:14.455170 543705 disk_worker.go:708] disk space is not compliant
W0321 02:46:14.455173 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:46:14.456486 543705 disk_worker.go:494] system disk:vda1
I0321 02:46:14.456530 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:46:15.455969 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:46:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:46:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:46:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:46:16.472476 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:46:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:23.409779 543705 memory.go:184] no items to output this cycle
I0321 02:46:23.409797 543705 cpu.go:275] no items to output this cycle
I0321 02:46:27.397675 543705 disk_info.go:125] begin check local disk info of client
I0321 02:46:27.400290 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:46:27.400296 543705 disk_info.go:196] parse disk info done, disk is : [0xc000509200 0xc000509240]
E0321 02:46:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:33.409784 543705 memory.go:184] no items to output this cycle
I0321 02:46:33.409802 543705 cpu.go:275] no items to output this cycle
E0321 02:46:43.409749 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:43.409773 543705 memory.go:191] Add success.
I0321 02:46:43.409793 543705 cpu.go:282] Add success.
I0321 02:46:43.419895 543705 net.go:648] Add success.
I0321 02:46:43.422506 543705 net.go:770] primary dev: ETH0
I0321 02:46:43.422519 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:46:43.422532 543705 net.go:698] Add success.
I0321 02:46:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:46:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:46:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:46:53.409750 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:46:53.409765 543705 memory.go:184] no items to output this cycle
I0321 02:46:53.409793 543705 cpu.go:275] no items to output this cycle
E0321 02:47:03.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:03.409783 543705 memory.go:184] no items to output this cycle
I0321 02:47:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 02:47:13.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:13.409809 543705 memory.go:191] Add success.
I0321 02:47:13.409815 543705 cpu.go:282] Add success.
W0321 02:47:13.409839 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:47:13.409860 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:47:13.409865 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:47:13.420369 543705 net.go:648] Add success.
I0321 02:47:13.423586 543705 net.go:770] primary dev: ETH0
I0321 02:47:13.423599 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:47:13.423610 543705 net.go:698] Add success.
I0321 02:47:13.452771 543705 event_worker.go:152] Polling the log file for events...
W0321 02:47:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:47:14.455185 543705 disk_worker.go:708] disk space is not compliant
W0321 02:47:14.455188 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:47:14.456782 543705 disk_worker.go:494] system disk:vda1
I0321 02:47:14.456817 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:47:14.457057 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:47:14.457064 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:47:14.457068 543705 custom_config.go:64] query custom config with name: gpu
E0321 02:47:15.456861 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:47:15.456869 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:47:16.457960 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:47:16.457969 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:47:16.458019 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:47:16.458035 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:47:16.472379 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:47:23.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:23.409784 543705 memory.go:184] no items to output this cycle
I0321 02:47:23.409790 543705 cpu.go:275] no items to output this cycle
I0321 02:47:27.401674 543705 disk_info.go:125] begin check local disk info of client
I0321 02:47:27.404164 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:47:27.404171 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462040 0xc000462080]
E0321 02:47:33.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:33.409800 543705 memory.go:184] no items to output this cycle
I0321 02:47:33.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:47:43.409758 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:43.409781 543705 memory.go:191] Add success.
I0321 02:47:43.409789 543705 cpu.go:282] Add success.
I0321 02:47:43.419864 543705 net.go:648] Add success.
I0321 02:47:43.422551 543705 net.go:770] primary dev: ETH0
I0321 02:47:43.422564 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:47:43.422577 543705 net.go:698] Add success.
I0321 02:47:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:47:46.458059 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:47:46.458085 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:47:53.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:47:53.409773 543705 memory.go:184] no items to output this cycle
I0321 02:47:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:48:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:03.409798 543705 memory.go:184] no items to output this cycle
I0321 02:48:03.409810 543705 cpu.go:275] no items to output this cycle
E0321 02:48:13.409747 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:13.409861 543705 memory.go:191] Add success.
W0321 02:48:13.409890 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:48:13.409902 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:48:13.409905 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:48:13.409923 543705 cpu.go:282] Add success.
I0321 02:48:13.419754 543705 net.go:648] Add success.
I0321 02:48:13.422311 543705 net.go:770] primary dev: ETH0
I0321 02:48:13.422324 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:48:13.422335 543705 net.go:698] Add success.
I0321 02:48:13.468097 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"8e775378-3de2-454a-82c8-a44d0d028471","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:48:13.468137 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:48:14.454974 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:48:14.455126 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:48:14.455191 543705 disk_worker.go:708] disk space is not compliant
W0321 02:48:14.455194 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:48:14.456523 543705 disk_worker.go:494] system disk:vda1
I0321 02:48:14.456566 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:48:15.455963 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:48:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:48:16.458037 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:48:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:48:16.472392 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:48:23.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:23.409790 543705 memory.go:184] no items to output this cycle
I0321 02:48:23.409795 543705 cpu.go:275] no items to output this cycle
I0321 02:48:27.405676 543705 disk_info.go:125] begin check local disk info of client
I0321 02:48:27.408504 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:48:27.408511 543705 disk_info.go:196] parse disk info done, disk is : [0xc00046f080 0xc00046f0c0]
E0321 02:48:33.409765 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:33.409781 543705 memory.go:184] no items to output this cycle
I0321 02:48:33.409802 543705 cpu.go:275] no items to output this cycle
I0321 02:48:38.701203 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:48:38.701209 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:48:43.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:43.410593 543705 memory.go:191] Add success.
I0321 02:48:43.409794 543705 cpu.go:282] Add success.
I0321 02:48:43.420366 543705 net.go:648] Add success.
I0321 02:48:43.422853 543705 net.go:770] primary dev: ETH0
I0321 02:48:43.422868 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:48:43.422885 543705 net.go:698] Add success.
I0321 02:48:46.457983 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:48:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:48:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:48:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:48:53.409768 543705 memory.go:184] no items to output this cycle
I0321 02:48:53.409786 543705 cpu.go:275] no items to output this cycle
E0321 02:49:03.409847 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:03.409865 543705 memory.go:184] no items to output this cycle
I0321 02:49:03.409922 543705 cpu.go:275] no items to output this cycle
E0321 02:49:13.409766 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:13.409792 543705 memory.go:191] Add success.
I0321 02:49:13.409798 543705 cpu.go:282] Add success.
W0321 02:49:13.409821 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:49:13.409833 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:49:13.409836 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:49:13.419988 543705 net.go:770] primary dev: ETH0
I0321 02:49:13.420000 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:49:13.420011 543705 net.go:698] Add success.
I0321 02:49:13.420359 543705 net.go:648] Add success.
I0321 02:49:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:49:14.455120 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:49:14.455186 543705 disk_worker.go:708] disk space is not compliant
W0321 02:49:14.455189 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:49:14.456509 543705 disk_worker.go:494] system disk:vda1
I0321 02:49:14.456553 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:49:15.455962 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:49:16.457993 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:49:16.458061 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:49:16.458086 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:49:16.472461 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:49:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:23.409783 543705 memory.go:184] no items to output this cycle
I0321 02:49:23.409786 543705 cpu.go:275] no items to output this cycle
I0321 02:49:27.409677 543705 disk_info.go:125] begin check local disk info of client
I0321 02:49:27.412166 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:49:27.412173 543705 disk_info.go:196] parse disk info done, disk is : [0xc0002bd600 0xc0002bd640]
E0321 02:49:33.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:33.409778 543705 memory.go:184] no items to output this cycle
I0321 02:49:33.409801 543705 cpu.go:275] no items to output this cycle
E0321 02:49:43.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:43.409809 543705 memory.go:191] Add success.
I0321 02:49:43.409817 543705 cpu.go:282] Add success.
I0321 02:49:43.419937 543705 net.go:648] Add success.
I0321 02:49:43.423051 543705 net.go:770] primary dev: ETH0
I0321 02:49:43.423065 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:49:43.423077 543705 net.go:698] Add success.
I0321 02:49:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:49:46.458054 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:49:46.458078 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:49:53.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:49:53.409791 543705 memory.go:184] no items to output this cycle
I0321 02:49:53.409807 543705 cpu.go:275] no items to output this cycle
E0321 02:50:03.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:03.409806 543705 memory.go:184] no items to output this cycle
I0321 02:50:03.409815 543705 cpu.go:275] no items to output this cycle
E0321 02:50:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:13.409783 543705 memory.go:191] Add success.
I0321 02:50:13.409798 543705 cpu.go:282] Add success.
W0321 02:50:13.409808 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:50:13.409819 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:50:13.409822 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:50:13.420178 543705 net.go:648] Add success.
I0321 02:50:13.422855 543705 net.go:770] primary dev: ETH0
I0321 02:50:13.422871 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:50:13.422885 543705 net.go:698] Add success.
I0321 02:50:14.454971 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:50:14.455122 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:50:14.455210 543705 disk_worker.go:708] disk space is not compliant
W0321 02:50:14.455213 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:50:14.456575 543705 disk_worker.go:494] system disk:vda1
I0321 02:50:14.456605 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:50:15.455973 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:50:16.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:50:16.458041 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:50:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:50:16.472533 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:50:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:23.409814 543705 memory.go:184] no items to output this cycle
I0321 02:50:23.409817 543705 cpu.go:275] no items to output this cycle
I0321 02:50:27.412791 543705 disk_info.go:125] begin check local disk info of client
I0321 02:50:27.415363 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:50:27.415371 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004eb880 0xc0004eb8c0]
E0321 02:50:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:33.409804 543705 memory.go:184] no items to output this cycle
I0321 02:50:33.409821 543705 cpu.go:275] no items to output this cycle
E0321 02:50:43.409759 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:43.409786 543705 memory.go:191] Add success.
I0321 02:50:43.409786 543705 cpu.go:282] Add success.
I0321 02:50:43.419855 543705 net.go:648] Add success.
I0321 02:50:43.422769 543705 net.go:770] primary dev: ETH0
I0321 02:50:43.422783 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:50:43.422795 543705 net.go:698] Add success.
I0321 02:50:46.457979 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:50:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:50:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:50:53.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:50:53.409893 543705 memory.go:184] no items to output this cycle
I0321 02:50:53.409909 543705 cpu.go:275] no items to output this cycle
E0321 02:51:03.409795 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:03.409814 543705 memory.go:184] no items to output this cycle
I0321 02:51:03.409829 543705 cpu.go:275] no items to output this cycle
E0321 02:51:13.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:13.409773 543705 memory.go:191] Add success.
W0321 02:51:13.409797 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
I0321 02:51:13.409803 543705 cpu.go:282] Add success.
W0321 02:51:13.409809 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:51:13.409813 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:51:13.420095 543705 net.go:648] Add success.
I0321 02:51:13.422998 543705 net.go:770] primary dev: ETH0
I0321 02:51:13.423010 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:51:13.423022 543705 net.go:698] Add success.
I0321 02:51:13.482823 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"599ad000-0757-4675-ac78-7adad4f7003e","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:51:13.482857 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:51:14.454977 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:51:14.455186 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:51:14.455197 543705 disk_worker.go:708] disk space is not compliant
W0321 02:51:14.455200 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:51:14.456538 543705 disk_worker.go:494] system disk:vda1
I0321 02:51:14.456587 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:51:15.455960 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:51:16.457968 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:51:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:51:16.458062 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:51:16.472384 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:51:23.409786 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:23.409810 543705 memory.go:184] no items to output this cycle
I0321 02:51:23.409819 543705 cpu.go:275] no items to output this cycle
I0321 02:51:27.415802 543705 disk_info.go:125] begin check local disk info of client
I0321 02:51:27.418308 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:51:27.418314 543705 disk_info.go:196] parse disk info done, disk is : [0xc000462600 0xc000462640]
E0321 02:51:33.409780 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:33.409799 543705 memory.go:184] no items to output this cycle
I0321 02:51:33.409815 543705 cpu.go:275] no items to output this cycle
I0321 02:51:38.701737 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:51:38.701744 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:51:43.409778 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:43.410692 543705 memory.go:191] Add success.
I0321 02:51:43.409819 543705 cpu.go:282] Add success.
I0321 02:51:43.420500 543705 net.go:648] Add success.
I0321 02:51:43.423054 543705 net.go:770] primary dev: ETH0
I0321 02:51:43.423069 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:51:43.423083 543705 net.go:698] Add success.
I0321 02:51:46.457980 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:51:46.458055 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:51:46.458080 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:51:53.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:51:53.409783 543705 memory.go:184] no items to output this cycle
I0321 02:51:53.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:52:03.409744 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:03.409760 543705 memory.go:184] no items to output this cycle
I0321 02:52:03.409795 543705 cpu.go:275] no items to output this cycle
E0321 02:52:13.409769 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:13.409796 543705 memory.go:191] Add success.
I0321 02:52:13.409801 543705 cpu.go:282] Add success.
W0321 02:52:13.409823 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:52:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:52:13.409838 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:52:13.420382 543705 net.go:648] Add success.
I0321 02:52:13.423360 543705 net.go:770] primary dev: ETH0
I0321 02:52:13.423373 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:52:13.423386 543705 net.go:698] Add success.
W0321 02:52:14.455105 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:52:14.455164 543705 disk_worker.go:708] disk space is not compliant
W0321 02:52:14.455167 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:52:14.456948 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:52:14.456957 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:52:14.456963 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:52:14.457007 543705 disk_worker.go:494] system disk:vda1
I0321 02:52:14.457035 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:52:15.456834 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:52:15.456843 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:52:16.457992 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:52:16.457992 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:52:16.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:52:16.458068 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:52:16.472380 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:52:23.409792 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:23.409814 543705 memory.go:184] no items to output this cycle
I0321 02:52:23.409822 543705 cpu.go:275] no items to output this cycle
I0321 02:52:27.418794 543705 disk_info.go:125] begin check local disk info of client
I0321 02:52:27.421350 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:52:27.421357 543705 disk_info.go:196] parse disk info done, disk is : [0xc00032b9c0 0xc00032ba00]
E0321 02:52:33.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:33.409801 543705 memory.go:184] no items to output this cycle
I0321 02:52:33.409813 543705 cpu.go:275] no items to output this cycle
E0321 02:52:43.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:43.409932 543705 memory.go:191] Add success.
I0321 02:52:43.409939 543705 cpu.go:282] Add success.
I0321 02:52:43.419713 543705 net.go:648] Add success.
I0321 02:52:43.422512 543705 net.go:770] primary dev: ETH0
I0321 02:52:43.422525 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:52:43.422536 543705 net.go:698] Add success.
I0321 02:52:46.457975 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:52:46.458046 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:52:46.458070 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:52:53.409746 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:52:53.409761 543705 memory.go:184] no items to output this cycle
I0321 02:52:53.409791 543705 cpu.go:275] no items to output this cycle
E0321 02:53:03.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:03.409787 543705 memory.go:184] no items to output this cycle
I0321 02:53:03.409788 543705 cpu.go:275] no items to output this cycle
E0321 02:53:13.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:13.409808 543705 memory.go:191] Add success.
I0321 02:53:13.409820 543705 cpu.go:282] Add success.
W0321 02:53:13.409834 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:53:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:53:13.409848 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:53:13.420137 543705 net.go:648] Add success.
I0321 02:53:13.423096 543705 net.go:770] primary dev: ETH0
I0321 02:53:13.423111 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:53:13.423125 543705 net.go:698] Add success.
I0321 02:53:14.454958 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:53:14.455097 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:53:14.455155 543705 disk_worker.go:708] disk space is not compliant
W0321 02:53:14.455158 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:53:14.456520 543705 disk_worker.go:494] system disk:vda1
I0321 02:53:14.456562 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:53:15.455958 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:53:16.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:53:16.458040 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:53:16.458066 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:53:16.472401 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:53:23.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:23.409811 543705 memory.go:184] no items to output this cycle
I0321 02:53:23.409819 543705 cpu.go:275] no items to output this cycle
I0321 02:53:27.421806 543705 disk_info.go:125] begin check local disk info of client
I0321 02:53:27.424365 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:53:27.424372 543705 disk_info.go:196] parse disk info done, disk is : [0xc00034a880 0xc00034a8c0]
E0321 02:53:33.409782 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:33.409803 543705 memory.go:184] no items to output this cycle
I0321 02:53:33.409814 543705 cpu.go:275] no items to output this cycle
E0321 02:53:43.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:43.409780 543705 memory.go:191] Add success.
I0321 02:53:43.409812 543705 cpu.go:282] Add success.
I0321 02:53:43.419868 543705 net.go:648] Add success.
I0321 02:53:43.422684 543705 net.go:770] primary dev: ETH0
I0321 02:53:43.422784 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:53:43.422801 543705 net.go:698] Add success.
I0321 02:53:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:53:46.458053 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:53:46.458079 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:53:53.409767 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:53:53.409783 543705 memory.go:184] no items to output this cycle
I0321 02:53:53.409802 543705 cpu.go:275] no items to output this cycle
E0321 02:54:03.409777 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:03.409798 543705 memory.go:184] no items to output this cycle
I0321 02:54:03.409800 543705 cpu.go:275] no items to output this cycle
E0321 02:54:13.409772 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:13.409797 543705 memory.go:191] Add success.
I0321 02:54:13.409798 543705 cpu.go:282] Add success.
W0321 02:54:13.409825 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:54:13.409837 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:54:13.409841 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:54:13.420066 543705 net.go:648] Add success.
I0321 02:54:13.423065 543705 net.go:770] primary dev: ETH0
I0321 02:54:13.423079 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:54:13.423091 543705 net.go:698] Add success.
I0321 02:54:13.465812 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"640fe46a-f6ee-4cc5-97fc-a35083636044","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:54:13.465847 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
I0321 02:54:14.454963 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:54:14.455113 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:54:14.455175 543705 disk_worker.go:708] disk space is not compliant
W0321 02:54:14.455177 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:54:14.456511 543705 disk_worker.go:494] system disk:vda1
I0321 02:54:14.456556 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:54:15.455965 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:54:16.457970 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:54:16.458029 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:54:16.458049 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:54:16.472387 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:54:23.409781 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:23.409797 543705 memory.go:184] no items to output this cycle
I0321 02:54:23.409798 543705 cpu.go:275] no items to output this cycle
I0321 02:54:27.424836 543705 disk_info.go:125] begin check local disk info of client
I0321 02:54:27.427377 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:54:27.427384 543705 disk_info.go:196] parse disk info done, disk is : [0xc0004dfd00 0xc0004dfd40]
E0321 02:54:33.409774 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:33.409796 543705 memory.go:184] no items to output this cycle
I0321 02:54:33.409807 543705 cpu.go:275] no items to output this cycle
I0321 02:54:38.703212 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:54:38.703218 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:54:43.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:43.410595 543705 memory.go:191] Add success.
I0321 02:54:43.409830 543705 cpu.go:282] Add success.
I0321 02:54:43.420278 543705 net.go:648] Add success.
I0321 02:54:43.422901 543705 net.go:770] primary dev: ETH0
I0321 02:54:43.422916 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:54:43.422930 543705 net.go:698] Add success.
I0321 02:54:46.458009 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:54:46.458191 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:54:46.458218 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:54:53.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:54:53.409805 543705 memory.go:184] no items to output this cycle
I0321 02:54:53.409819 543705 cpu.go:275] no items to output this cycle
E0321 02:55:03.409775 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:03.409791 543705 memory.go:184] no items to output this cycle
I0321 02:55:03.409798 543705 cpu.go:275] no items to output this cycle
E0321 02:55:13.409771 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:13.409802 543705 memory.go:191] Add success.
I0321 02:55:13.409808 543705 cpu.go:282] Add success.
W0321 02:55:13.409832 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:55:13.409845 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:55:13.409847 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:55:13.420089 543705 net.go:648] Add success.
I0321 02:55:13.423158 543705 net.go:770] primary dev: ETH0
I0321 02:55:13.423171 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:55:13.423183 543705 net.go:698] Add success.
I0321 02:55:14.454970 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:55:14.455206 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:55:14.455216 543705 disk_worker.go:708] disk space is not compliant
W0321 02:55:14.455218 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:55:14.456614 543705 disk_worker.go:494] system disk:vda1
I0321 02:55:14.456645 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:55:15.455964 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:55:16.457965 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:55:16.458026 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:55:16.458051 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:55:16.472425 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:55:23.409787 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:23.409810 543705 memory.go:184] no items to output this cycle
I0321 02:55:23.409828 543705 cpu.go:275] no items to output this cycle
I0321 02:55:27.427846 543705 disk_info.go:125] begin check local disk info of client
I0321 02:55:27.430279 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:55:27.430285 543705 disk_info.go:196] parse disk info done, disk is : [0xc0003dc040 0xc0003dc080]
E0321 02:55:33.409776 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:33.409799 543705 memory.go:184] no items to output this cycle
I0321 02:55:33.409809 543705 cpu.go:275] no items to output this cycle
E0321 02:55:43.409755 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:43.409781 543705 memory.go:191] Add success.
I0321 02:55:43.409804 543705 cpu.go:282] Add success.
I0321 02:55:43.419868 543705 net.go:648] Add success.
I0321 02:55:43.422927 543705 net.go:770] primary dev: ETH0
I0321 02:55:43.422942 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:55:43.422955 543705 net.go:698] Add success.
I0321 02:55:46.457976 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:55:46.458045 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:55:46.458074 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:55:53.409751 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:55:53.409768 543705 memory.go:184] no items to output this cycle
I0321 02:55:53.409796 543705 cpu.go:275] no items to output this cycle
E0321 02:56:03.409788 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:03.409809 543705 memory.go:184] no items to output this cycle
I0321 02:56:03.409822 543705 cpu.go:275] no items to output this cycle
E0321 02:56:13.409757 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:13.409787 543705 memory.go:191] Add success.
I0321 02:56:13.409803 543705 cpu.go:282] Add success.
W0321 02:56:13.409813 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:56:13.409824 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:56:13.409827 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:56:13.420068 543705 net.go:648] Add success.
I0321 02:56:13.422694 543705 net.go:770] primary dev: ETH0
I0321 02:56:13.422709 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:56:13.422724 543705 net.go:698] Add success.
I0321 02:56:14.454969 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:56:14.455197 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:56:14.455207 543705 disk_worker.go:708] disk space is not compliant
W0321 02:56:14.455210 543705 disk_worker.go:728] disk inode is not compliant
I0321 02:56:14.456597 543705 disk_worker.go:494] system disk:vda1
I0321 02:56:14.456628 543705 disk_worker.go:432] add disk info successfully, len:33
I0321 02:56:15.455961 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:56:16.457972 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:56:16.458032 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:56:16.458054 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:56:16.472355 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:56:23.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:23.409777 543705 memory.go:184] no items to output this cycle
I0321 02:56:23.409796 543705 cpu.go:275] no items to output this cycle
I0321 02:56:27.430848 543705 disk_info.go:125] begin check local disk info of client
I0321 02:56:27.433382 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:56:27.433389 543705 disk_info.go:196] parse disk info done, disk is : [0xc000278900 0xc000278940]
E0321 02:56:33.409779 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:33.409798 543705 memory.go:184] no items to output this cycle
I0321 02:56:33.409812 543705 cpu.go:275] no items to output this cycle
E0321 02:56:43.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:43.409791 543705 memory.go:191] Add success.
I0321 02:56:43.409795 543705 cpu.go:282] Add success.
I0321 02:56:43.419965 543705 net.go:648] Add success.
I0321 02:56:43.422507 543705 net.go:770] primary dev: ETH0
I0321 02:56:43.422520 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:56:43.422533 543705 net.go:698] Add success.
I0321 02:56:46.457977 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:56:46.458044 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:56:46.458067 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:56:53.409752 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:56:53.409770 543705 memory.go:184] no items to output this cycle
I0321 02:56:53.409787 543705 cpu.go:275] no items to output this cycle
E0321 02:57:03.409770 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:03.409780 543705 cpu.go:275] no items to output this cycle
I0321 02:57:03.409785 543705 memory.go:184] no items to output this cycle
E0321 02:57:13.409785 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:13.409808 543705 cpu.go:282] Add success.
I0321 02:57:13.409814 543705 memory.go:191] Add success.
W0321 02:57:13.409841 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:57:13.409854 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:57:13.409857 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:57:13.420161 543705 net.go:648] Add success.
I0321 02:57:13.423001 543705 net.go:770] primary dev: ETH0
I0321 02:57:13.423014 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:57:13.423026 543705 net.go:698] Add success.
I0321 02:57:13.429076 543705 net.go:1082] primary mac is: fa:27:00:10:e8:f4,dev: ETH0
I0321 02:57:13.453249 543705 event_worker.go:152] Polling the log file for events...
I0321 02:57:13.504210 543705 http_client.go:75] url:http://bcm.baidubce.com/csm/api/v1/agentControl?instance=b68fcff7-5bec-43da-a6d9-4d16e8b44d62&uid=fde98ef1ae594742bdfe91499b97205e, resp:{"requestId":"b31e7dae-75a1-4dc3-bc1c-dddafd36c766","message":"","success":true,"result":{"userId":"fde98ef1ae594742bdfe91499b97205e","instance":{"uuid":"b68fcff7-5bec-43da-a6d9-4d16e8b44d62","shortId":"","type":""},"controls":[{"name":"rdma","enable":true},{"name":"gpu","enable":false}],"rdmaControls":null},"code":200}
I0321 02:57:13.504251 543705 custom_config.go:56] updated config: {
"gpu": {
"Name": "gpu",
"Enable": false
},
"rdma": {
"Name": "rdma",
"Enable": true
}
}
W0321 02:57:14.455136 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:57:14.455199 543705 disk_worker.go:708] disk space is not compliant
W0321 02:57:14.455202 543705 disk_worker.go:728] disk inode is not compliant
E0321 02:57:14.455906 543705 exec_cmd.go:13] Can not obtain CombinedOutput for "dcgmi discovery --list", operation name: check dcgm discover, error:exit status 127
E0321 02:57:14.455914 543705 gpu_controllor.go:82] DCGM discovery failed: exit status 127
I0321 02:57:14.455920 543705 custom_config.go:64] query custom config with name: gpu
I0321 02:57:14.456560 543705 disk_worker.go:494] system disk:vda1
I0321 02:57:14.456589 543705 disk_worker.go:432] add disk info successfully, len:33
E0321 02:57:15.456820 543705 npu_controller.go:127] Fail to call npu-smi, exit status 127, bash: line 1: /usr/local/sbin/npu-smi: No such file or directory
I0321 02:57:15.456829 543705 custom_config.go:64] query custom config with name: huawei_npu
I0321 02:57:16.457946 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
E0321 02:57:16.457955 543705 rdma_controller.go:118] fail to call which hccn_too, err:exit status 1, output:
I0321 02:57:16.457997 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:57:16.458016 543705 gpu_kunlun.go:227] Add success, len:1
I0321 02:57:16.472346 543705 disk_local_worker.go:436] Get disk info: []
E0321 02:57:23.409763 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:23.409782 543705 memory.go:184] no items to output this cycle
I0321 02:57:23.409800 543705 cpu.go:275] no items to output this cycle
I0321 02:57:27.433861 543705 disk_info.go:125] begin check local disk info of client
I0321 02:57:27.436434 543705 disk_info.go:161] parse disk info: sr0 QM00051
vda v-xC6VYz3d
I0321 02:57:27.436441 543705 disk_info.go:196] parse disk info done, disk is : [0xc00053a840 0xc00053a880]
E0321 02:57:33.409760 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:33.409775 543705 memory.go:184] no items to output this cycle
I0321 02:57:33.409780 543705 cpu.go:275] no items to output this cycle
I0321 02:57:38.704215 543705 nvml_worker.go:363] ERROR_LIBRARY_NOT_FOUND
I0321 02:57:38.704222 543705 nvml_worker.go:378] NVIDIA driver is not ready, try again in 1 minute.
E0321 02:57:43.409783 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:43.410619 543705 memory.go:191] Add success.
I0321 02:57:43.409824 543705 cpu.go:282] Add success.
I0321 02:57:43.420334 543705 net.go:648] Add success.
I0321 02:57:43.423031 543705 net.go:770] primary dev: ETH0
I0321 02:57:43.423043 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:57:43.423058 543705 net.go:698] Add success.
I0321 02:57:46.457984 543705 gpu_kunlun.go:233] start to get kun lun stat by xpuml
I0321 02:57:46.458049 543705 gpu_kunlun.go:354] start to get kun lun stat by xpu_smi
I0321 02:57:46.458075 543705 gpu_kunlun.go:227] Add success, len:1
E0321 02:57:53.409873 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:57:53.409893 543705 memory.go:184] no items to output this cycle
I0321 02:57:53.409951 543705 cpu.go:275] no items to output this cycle
E0321 02:58:03.409761 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:58:03.409779 543705 memory.go:184] no items to output this cycle
I0321 02:58:03.409800 543705 cpu.go:275] no items to output this cycle
E0321 02:58:13.409789 543705 memory.go:175] total swap size <= 0, can not cal swap used percent
I0321 02:58:13.409826 543705 memory.go:191] Add success.
I0321 02:58:13.409831 543705 cpu.go:282] Add success.
W0321 02:58:13.409858 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ce_count
W0321 02:58:13.409874 543705 memory.go:428] No files found matching pattern: /sys/devices/system/edac/mc/mc*/ue_count
I0321 02:58:13.409878 543705 memory.go:414] ECC metrics: ce_delta=0, ue_delta=0
I0321 02:58:13.420170 543705 net.go:648] Add success.
I0321 02:58:13.423233 543705 net.go:770] primary dev: ETH0
I0321 02:58:13.423246 543705 net.go:802] Send network stats successfully!,count is 6
I0321 02:58:13.423258 543705 net.go:698] Add success.
I0321 02:58:14.454966 543705 custom_config.go:64] query custom config with name: gpu
W0321 02:58:14.455174 543705 disk_worker.go:1014] failed to get sector size for device ramfs: lstat /sys/class/block/ramfs: no such file or directory
W0321 02:58:14.455184 543705 disk_worke